From 4c71970057e11d969513fa28161cbc05511e4a68 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 19 Feb 2020 11:15:00 -0600 Subject: [PATCH 001/660] add libdeepgalois and lonestargnn --- CMakeLists.txt | 5 + libdeepgalois/CMakeLists.txt | 23 + libdeepgalois/gnn.h | 31 ++ libdeepgalois/layers.h | 8 + libdeepgalois/layers/arithmetic_layer.h | 22 + libdeepgalois/layers/graph_conv_layer.h | 186 ++++++++ libdeepgalois/layers/layer.h | 156 +++++++ libdeepgalois/layers/linear_layer.h | 28 ++ libdeepgalois/layers/relu_layer.h | 24 ++ libdeepgalois/layers/softmax_loss_layer.h | 47 ++ libdeepgalois/lgraph.h | 179 ++++++++ libdeepgalois/math_functions.hpp | 500 ++++++++++++++++++++++ libdeepgalois/net.h | 341 +++++++++++++++ libdeepgalois/node.h | 109 +++++ libdeepgalois/optimizer.h | 221 ++++++++++ libdeepgalois/random.h | 63 +++ libdeepgalois/timer.h | 21 + libdeepgalois/types.h | 34 ++ libdeepgalois/utils.h | 119 +++++ lonestargnn/CMakeLists.txt | 8 + lonestargnn/README.md | 60 +++ lonestargnn/gcn/CMakeLists.txt | 16 + lonestargnn/gcn/gcn.cpp | 47 ++ lonestargnn/graphsage/gs-mean.cpp | 41 ++ lonestargnn/lonestargnn.h | 50 +++ lonestargnn/run-citeseer.sh | 1 + 26 files changed, 2340 insertions(+) create mode 100644 libdeepgalois/CMakeLists.txt create mode 100644 libdeepgalois/gnn.h create mode 100644 libdeepgalois/layers.h create mode 100644 libdeepgalois/layers/arithmetic_layer.h create mode 100644 libdeepgalois/layers/graph_conv_layer.h create mode 100644 libdeepgalois/layers/layer.h create mode 100644 libdeepgalois/layers/linear_layer.h create mode 100644 libdeepgalois/layers/relu_layer.h create mode 100644 libdeepgalois/layers/softmax_loss_layer.h create mode 100644 libdeepgalois/lgraph.h create mode 100644 libdeepgalois/math_functions.hpp create mode 100644 libdeepgalois/net.h create mode 100644 libdeepgalois/node.h create mode 100644 libdeepgalois/optimizer.h create mode 100644 libdeepgalois/random.h create mode 100644 libdeepgalois/timer.h create mode 100644 libdeepgalois/types.h create mode 100644 libdeepgalois/utils.h create mode 100644 lonestargnn/CMakeLists.txt create mode 100644 lonestargnn/README.md create mode 100644 lonestargnn/gcn/CMakeLists.txt create mode 100644 lonestargnn/gcn/gcn.cpp create mode 100644 lonestargnn/graphsage/gs-mean.cpp create mode 100644 lonestargnn/lonestargnn.h create mode 100755 lonestargnn/run-citeseer.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 40a7a7fb7b..8ce9f7f3a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ set(USE_SANITIZER OFF CACHE BOOL "Use address and memory sanatizer") set(INSTALL_APPS OFF CACHE BOOL "Install apps as well as library") set(SKIP_COMPILE_APPS OFF CACHE BOOL "Skip compilation of applications using Galois library") set(GRAPH_LOCATION "" CACHE PATH "Location of inputs for tests if downloaded/stored separately.") +set(USE_DEEPGALOIS OFF CACHE BOOL "Install gnn apps as well as the DeepGalois library") if(WIN32 AND NOT CYGWIN) set(DEFAULT_INSTALL_CMAKE_DIR "${CMAKE_INSTALL_PREFIX}/CMake") @@ -514,6 +515,10 @@ endfunction() add_subdirectory(libllvm) add_subdirectory(libgalois) +if(USE_DEEPGALOIS) + add_subdirectory(libdeepgalois) + add_subdirectory(lonestargnn) +endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) add_subdirectory(libcusp) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt new file mode 100644 index 0000000000..8caa65ebc9 --- /dev/null +++ b/libdeepgalois/CMakeLists.txt @@ -0,0 +1,23 @@ +set(sources + $ +# $ + $ +) + +add_library(deepgalois STATIC 
${sources}) + +target_link_libraries(deepgalois galois_shmem galois_dist_async gllvm) +target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) + +target_include_directories(deepgalois PUBLIC + ${CMAKE_SOURCE_DIR}/libllvm/include + ${CMAKE_SOURCE_DIR}/libgalois/include + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libdeepgalios/include + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +set_target_properties (deepgalois PROPERTIES + INTERFACE_POSITION_INDEPENDENT_CODE On + POSITION_INDEPENDENT_CODE On +) diff --git a/libdeepgalois/gnn.h b/libdeepgalois/gnn.h new file mode 100644 index 0000000000..d2d2bafb28 --- /dev/null +++ b/libdeepgalois/gnn.h @@ -0,0 +1,31 @@ +#ifndef _GNN_H_ +#define _GNN_H_ + +#include "galois/Galois.h" +#include "galois/Reduction.h" +#include "galois/Timer.h" +#include "galois/ParallelSTL.h" +#include "llvm/Support/CommandLine.h" +#include "galois/runtime/Profile.h" +#include + +namespace cll = llvm::cl; +static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' +static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph +static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); +static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); +static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); +static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); +static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); +static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); +static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); +static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +#define CHUNK_SIZE 256 + +#include "types.h" +#include "utils.h" +#include "net.h" + +#endif diff --git a/libdeepgalois/layers.h b/libdeepgalois/layers.h new file mode 100644 index 0000000000..9650e931a9 --- /dev/null +++ b/libdeepgalois/layers.h @@ -0,0 +1,8 @@ +#ifndef _LAYERS_H_ +#define _LAYERS_H_ +#include "layers/relu_layer.h" +#include "layers/linear_layer.h" +#include "layers/arithmetic_layer.h" +#include "layers/graph_conv_layer.h" +#include "layers/softmax_loss_layer.h" +#endif diff --git a/libdeepgalois/layers/arithmetic_layer.h b/libdeepgalois/layers/arithmetic_layer.h new file mode 100644 index 0000000000..aed91e0379 --- /dev/null +++ b/libdeepgalois/layers/arithmetic_layer.h @@ -0,0 +1,22 @@ +#pragma once +#include "layer.h" + +// element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` +class elementwise_add_layer : public layer { +public: + elementwise_add_layer(unsigned level, std::vector in_dim, + std::vector out_dim) : layer(level, in_dim, out_dim) { + trainable_ = false; + } + std::string layer_type() const override { return std::string("elementwise_add"); } + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + for (size_t sample = 0; sample < in_data.size(); ++sample) { + for (size_t j = 0; j < in_data[0].size(); j++) + out_data[sample][j] = in_data[sample][j]; + } + } + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) override { + in_grad = out_grad; + } +}; diff --git a/libdeepgalois/layers/graph_conv_layer.h b/libdeepgalois/layers/graph_conv_layer.h new file mode 100644 index 0000000000..b81f7bc10e --- /dev/null +++ b/libdeepgalois/layers/graph_conv_layer.h @@ -0,0 +1,186 @@ +#pragma once +#include "layer.h" + +/* GraphConv Layer + Parameters + ---------- + x: int, number of samples. + y: int, Input feature size. + z: int, Output feature size. + dropout: bool, optional, if True, a dropout operation is applied before other operations. + norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. Default: ``True``. + bias : bool, optional, if True, adds a learnable bias to the output. Default: ``False``. + activation: callable activation function/layer or None, optional + If not None, applies an activation function to the updated node features. Default: ``None``. +*/ +class graph_conv_layer: public layer { +public: + graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, + std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), dropout_(dropout) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; + init(); + } + void init() { + std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; + Timer t_alloc; + t_alloc.Start(); + // randomly initialize trainable parameters for conv layers + rand_init_matrix(y, z, W); + //rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, weight_grad); + alloc_grad(); + if (dropout_) { + dropout_mask.resize(x); + for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); + } + in_temp.resize(x*y); + //for (size_t i = 0; i < x; ++i) in_temp[i].resize(y); + out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + //for (size_t i = 0; i < x; ++i) out_temp[i].resize(z); + trans_data.resize(y*x); // y*x + //for (size_t i = 0; i < y; ++i) trans_data[i].resize(x); + if (norm_) norm_factor_counting(); + t_alloc.Stop(); + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; + } + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, in_dims, out_dims) {} + ~graph_conv_layer() {} + std::string layer_type() const override { return std::string("graph_conv"); } + + // user-defined aggregate function + void aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } + + // user-defined combine function + void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { + vec_t a(out.size(), 0); + vec_t b(out.size(), 0); + mvmul(mat_v, self, a); + mvmul(mat_u, neighbors, b); + vadd(a, b, out); // out = W*self + Q*neighbors + } + + void set_context(net_phase ctx) override { phase_ = ctx; } + + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + // input: x*y; W: y*z; output: x*z + // if y > z: + // mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + //Timer t_matmul, t_agg, t_dropout; + //t_matmul.Start(); + if (dropout_ && phase_ == net_phase::train) { + //t_dropout.Start(); + //for (size_t i = 0; i < x; ++i) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + dropout(in_data[i], dropout_mask[i], &in_temp[i*y]); + }, galois::loopname("dropout")); + //t_dropout.Stop(); + matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z + } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z + //t_matmul.Stop(); + //t_agg.Start(); + aggregate(graph, out_temp, out_data); // aggregate + //t_agg.Stop(); + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + relu(out_data[i], out_data[i]); + }, galois::loopname("relu")); + } + //double dropout_time = 0; + //if (dropout_ && phase_ == net_phase::train) dropout_time = t_dropout.Millisecs(); + //std::cout << "\n\t" << name_ << " matmul time: " << t_matmul.Millisecs() + // << ", aggregation time: " << t_agg.Millisecs() << ", dropout time: " << dropout_time << "\n"; + } + + // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { + if (act_) { + //for (size_t j = 0; j < z; ++j) + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + for (size_t j = 0; j < z; ++j) + //if (out_data[i][j] <= 0.0) out_temp[i][j] = 0.0; + out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); + }, galois::loopname("d_relu")); + //} else out_temp = out_grad; // TODO: avoid copying + } else copy2D1D(out_grad, out_temp); + if (level_ != 0) { // no need to calculate in_grad for the first layer + vec_t trans_W(z*y); + transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same + if (dropout_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + d_dropout(in_grad[i], dropout_mask[i], in_grad[i]); + }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); + } + } + + // calculate weight gradients + transpose2D1D(in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z + } + + void degree_counting() { + assert(x == graph->size()); + degrees.resize(x); + galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { + degrees[v] = std::distance(graph->edge_begin(v), graph->edge_end(v)); + }, galois::loopname("DegreeCounting")); + } + + // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v + void norm_factor_counting() { + degree_counting(); + norm_factor.resize(x); + galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); + } + +private: + Graph *graph; + bool act_; // whether to use activation function at the end + bool norm_; // whether to normalize data + bool bias_; // whether to add bias afterwards + bool dropout_; // whether to use dropout at first + net_phase phase_; + size_t x; + size_t y; + size_t z; + vec_t out_temp; + vec_t in_temp; + vec_t trans_data; // y*x + std::vector degrees; + std::vector norm_factor; // normalization constant based on graph structure + std::vector > dropout_mask; + + // Glorot & Bengio (AISTATS 2010) init + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { + auto init_range = sqrt(6.0/(dim_x + dim_y)); + std::default_random_engine rng; + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i*dim_y+j] = dist(rng); + } + } + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i*dim_y+j] = 0; + } + } +}; diff --git a/libdeepgalois/layers/layer.h b/libdeepgalois/layers/layer.h new file mode 100644 index 0000000000..4a8a545738 --- /dev/null +++ b/libdeepgalois/layers/layer.h @@ -0,0 +1,156 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../node.h" +#include "../types.h" +#include "../utils.h" +#include "../optimizer.h" +#include "../math_functions.hpp" +/** + * base class of all kind of NN layers + * + * sub-class should override these methods: + * - forward_propagation ... body of forward-pass calculation + * - back_propagation ... body of backward-pass calculation + * - in_shape ... specify input data shapes + * - out_shape ... specify output data shapes + * - layer_type ... 
name of layer + **/ + +class layer : public node { +public: + layer(unsigned level, std::vector in_dims, std::vector out_dims) : + node(in_dims.size(), out_dims.size()), + level_(level), begin_(0), end_(0), num_dims(in_dims.size()), + input_dims(in_dims), output_dims(out_dims) { add_edge(); } + virtual ~layer() = default; + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) = 0; + virtual std::string layer_type() const = 0; + virtual void set_context(net_phase ctx) {} + //virtual void setup(Graph *g, vec_t *diff, LabelList *lab) = 0; + + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() + << " input[" << input_dims[0] << "," << input_dims[1] + << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, MaskList &masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; + } + void set_in_data(tensor_t data) { + prev_ = std::make_shared(this, input_dims[1]); + prev_->get_data() = data; + prev_->get_gradient().resize(input_dims[0]); + // allocate memory for intermediate gradients + //std::cout << "l0 in_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; + for (size_t i = 0; i < input_dims[0]; ++i) + prev_->get_gradient()[i].resize(input_dims[1]); + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[1]); + // allocate memory for intermediate feature vectors + next_->get_data().resize(output_dims[0]); + for (size_t i = 0; i < output_dims[0]; ++i) + next_->get_data()[i].resize(output_dims[1]); + } + void alloc_grad() { + // allocate memory for intermediate gradients + //std::cout << "l" << level_ << " out_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; + next_->get_gradient().resize(output_dims[0]); + for (size_t i = 0; i < output_dims[0]; ++i) + next_->get_gradient()[i].resize(output_dims[1]); + } + void forward() { + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer *opt) { + //std::cout << "[debug] " << name_ << ": updating weight...\n"; + // parallelize only when target size is big enough to mitigate thread spawning overhead. 
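+    // A sketch of what opt->update(weight_grad, W, parallel) does below (see
+    // optimizer.h): e.g. for adagrad, g[i] += dW[i]*dW[i] and then
+    // W[i] -= alpha * dW[i] / (sqrt(g[i]) + eps); the adaptive optimizers keep
+    // this per-weight state in a map keyed by &W.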
+ bool parallel = (W.size() >= 512); + //vec_t diff; + //prev()->merge_grads(&diff); + //auto in_data = prev()->get_data(); + //float_t rcp_batch_size = float_t(1.0) / in_data.size(); + //for (size_t i = 0; i < diff.size(); ++i) + // diff[i] *= rcp_batch_size; + opt->update(weight_grad, W, parallel); // W += grad + prev()->clear_grads(); + } + inline acc_t get_masked_loss() { + //acc_t total_loss = acc_t(0); + //size_t valid_sample_count = 0; + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + //for (size_t i = begin_; i < end_; i ++) { + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + //} + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; + } + +protected: + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + MaskList masks_; // masks to show which samples are valid + size_t num_dims; // number of dimensions + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E + vec_t weight_grad; // weight gradient for updating parameters + vec_t loss; // error for each vertex: N x 1 +}; + +// head: layer i+1, tail: layer i +inline void connect(layer *head, layer *tail, + size_t head_index = 0, size_t tail_index = 0) { + //auto out_shape = head->out_shape()[head_index]; + //auto in_shape = tail->in_shape()[tail_index]; + //head->setup(false); + //if (in_shape.size() == 0) { + // tail->set_in_shape(out_shape); + // in_shape = out_shape; + //} + //if (out_shape.size() != in_shape.size()) + // connection_mismatch(*head, *tail); + //if (!head->next_[head_index]) + // throw nn_error("output edge must not be null"); + tail->prev_ = head->next_; + tail->prev_->add_next_node(tail); +} + diff --git a/libdeepgalois/layers/linear_layer.h b/libdeepgalois/layers/linear_layer.h new file mode 100644 index 0000000000..e4ff524f3f --- /dev/null +++ b/libdeepgalois/layers/linear_layer.h @@ -0,0 +1,28 @@ +#pragma once +#include "layer.h" + +class linear_layer : public layer { +public: + linear_layer(unsigned level, float_t scale, float_t bias, + std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { + trainable_ = false; } + linear_layer(unsigned level, std::vector in_dim, + std::vector out_dim) : linear_layer(level, 1.0, 0.0, in_dim, out_dim) { } + std::string layer_type() const override { return "linear"; } + + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) { + for (size_t i = 0; i < input_dims[1]; i ++) + out_data[sample][i] = scale_ * in_data[sample][i] + bias_; + } + } + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) + for (size_t i = 0; i < input_dims[1]; i++) + in_grad[sample][i] = out_grad[sample][i] * scale_; + } +protected: + float_t scale_, bias_; 
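+  // scale_ and bias_ define the element-wise affine map computed above:
+  // out = scale_ * in + bias_ (identity by default, with scale_ = 1.0 and bias_ = 0.0).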
+}; diff --git a/libdeepgalois/layers/relu_layer.h b/libdeepgalois/layers/relu_layer.h new file mode 100644 index 0000000000..389e6b3c1f --- /dev/null +++ b/libdeepgalois/layers/relu_layer.h @@ -0,0 +1,24 @@ +#pragma once +#include "layer.h" + +// ReLU Layer +class relu_layer : public layer { +public: + relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + std::string layer_type() const override { return std::string("relu"); } + // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + out_data[i][j] = std::max(in_data[i][j], (float_t)0) + + negative_slope * std::min(in_data[i][j], (float_t)0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); + } + // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) + // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) override {} +}; diff --git a/libdeepgalois/layers/softmax_loss_layer.h b/libdeepgalois/layers/softmax_loss_layer.h new file mode 100644 index 0000000000..bdd52e4d38 --- /dev/null +++ b/libdeepgalois/layers/softmax_loss_layer.h @@ -0,0 +1,47 @@ +#pragma once +#include "layer.h" + +class softmax_loss_layer: public layer { +public: + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims, LabelList *lab) + : layer(level, in_dims, out_dims), labels(lab) { + trainable_ = false; + loss.resize(in_dims[0]); // error for each sample + name_ = layer_type() + "_" + std::to_string(level); + } + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : + softmax_loss_layer(level, in_dims, out_dims, NULL) {} + ~softmax_loss_layer() {} + std::string layer_type() const override { return std::string("softmax_loss"); } + + // TODO: need kernel fusion optimization + // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(in_data[i], out_data[i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; // one-hot + loss[i] = cross_entropy(y, out_data[i]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); + } + + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { + //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + vec_t norm_grad(output_dims[1]); + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; + d_cross_entropy(y, out_data[i], norm_grad); + d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); + } + +private: + LabelList *labels; +}; + diff --git a/libdeepgalois/lgraph.h b/libdeepgalois/lgraph.h new file mode 100644 index 0000000000..78f6f76aec --- /dev/null +++ b/libdeepgalois/lgraph.h @@ -0,0 +1,179 @@ +#ifndef __LGRAPH_HPP__ +#define __LGRAPH_HPP__ + 
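+// A hypothetical CSR example for the structure defined below: the undirected
+// triangle {0-1, 0-2, 1-2} would be stored as rowptr_ = {0,2,4,6} and
+// colidx_ = {1,2,0,2,0,1}, so the neighbors of vertex v are
+// colidx_[rowptr_[v] .. rowptr_[v+1]).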
+//defines the Learning Graph (LGraph) data structure +#include +#include +#include +#include +#include +#include +typedef unsigned IndexT; +typedef float ValueT; + +struct Edge { + IndexT src; + IndexT dst; + ValueT elabel; + Edge() : src(0), dst(0), elabel(0) {} + Edge(IndexT from, IndexT to, ValueT el) : + src(from), dst(to), elabel(el) {} + std::string to_string() const { + std::stringstream ss; + ss << "e(" << src << "," << dst << "," << elabel << ")"; + return ss.str(); + } +}; +typedef std::vector EdgeList; + +class LGraph { +public: + LGraph() : symmetrize_(false), directed_(false) {} + void clean() { + delete[] rowptr_; + delete[] colidx_; + delete[] weight_; + degrees.clear(); + el.clear(); + //labels_.clear(); + //vertices.clear(); + } + bool directed() const { return directed_; } + size_t num_vertices() const { return num_vertices_; } + size_t num_edges() const { return num_edges_; } + IndexT * out_rowptr() const { return rowptr_; } + IndexT * out_colidx() const { return colidx_; } + unsigned out_degree(IndexT n) const { return rowptr_[n+1] - rowptr_[n]; } + IndexT get_offset(IndexT n) { return rowptr_[n]; } + IndexT get_dest(IndexT n) { return colidx_[n]; } + ValueT get_weight(IndexT n) { return weight_[n]; } + unsigned get_max_degree() { return max_degree; } + //ValueT * labels() { return labels_.data(); } + //ValueT get_label(IndexT n) { return labels_[n]; } + void read_edgelist(const char *filename, bool symmetrize = false) { + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + IndexT max_vid = 0; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + IndexT u, v; + edge_stream >> u; + edge_stream >> v; + el.push_back(Edge(u, v, 1)); + if (symmetrize) el.push_back(Edge(v, u, 1)); + if (u > max_vid) max_vid = u; + if (v > max_vid) max_vid = v; + } + in.close(); + directed_ = true; + num_vertices_ = max_vid+1; + num_edges_ = el.size(); + std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " << num_edges_ << "\n"; + MakeGraphFromEL(); + } + +private: + EdgeList el; + bool symmetrize_; // whether to symmetrize a directed graph + bool directed_; + size_t num_vertices_; + size_t num_edges_; + IndexT *rowptr_; + IndexT *colidx_; + ValueT *weight_; + unsigned max_degree; + std::vector degrees; + std::vector labels_; + std::vector > vertices; + + static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } + + void MakeGraphFromEL() { + SquishGraph(); + MakeCSR(false); + } + + void SquishGraph(bool remove_selfloops = true, bool remove_redundents = true) { + std::vector neighbors; + for (size_t i = 0; i < num_vertices_; i++) + vertices.push_back(neighbors); + for (size_t i = 0; i < num_edges_; i ++) + vertices[el[i].src].push_back(el[i]); + el.clear(); + printf("Sorting the neighbor lists..."); + for (size_t i = 0; i < num_vertices_; i ++) + std::sort(vertices[i].begin(), vertices[i].end(), compare_id); + printf(" Done\n"); + //remove self loops + int num_selfloops = 0; + if(remove_selfloops) { + printf("Removing self loops..."); + for(size_t i = 0; i < num_vertices_; i ++) { + for(unsigned j = 0; j < vertices[i].size(); j ++) { + if(i == vertices[i][j].dst) { + vertices[i].erase(vertices[i].begin()+j); + num_selfloops ++; + j --; + } + } + } + printf(" %d selfloops are removed\n", num_selfloops); + num_edges_ -= num_selfloops; + } + // remove redundent + int num_redundents = 0; + if(remove_redundents) { + printf("Removing redundent edges..."); + for (size_t i = 0; i < num_vertices_; i ++) { + for (unsigned j = 1; j < 
vertices[i].size(); j ++) { + if (vertices[i][j].dst == vertices[i][j-1].dst) { + vertices[i].erase(vertices[i].begin()+j); + num_redundents ++; + j --; + } + } + } + printf(" %d redundent edges are removed\n", num_redundents); + num_edges_ -= num_redundents; + } + } + + void MakeCSR(bool transpose) { + degrees.resize(num_vertices_); + std::fill(degrees.begin(), degrees.end(), 0); + for (size_t i = 0; i < num_vertices_; i ++) + degrees[i] = vertices[i].size(); + max_degree = *(std::max_element(degrees.begin(), degrees.end())); + + std::vector offsets(degrees.size() + 1); + IndexT total = 0; + for (size_t n = 0; n < degrees.size(); n++) { + offsets[n] = total; + total += degrees[n]; + } + offsets[degrees.size()] = total; + + assert(num_edges_ == offsets[num_vertices_]); + weight_ = new ValueT[num_edges_]; + colidx_ = new IndexT[num_edges_]; + rowptr_ = new IndexT[num_vertices_+1]; + for (size_t i = 0; i < num_vertices_+1; i ++) rowptr_[i] = offsets[i]; + for (size_t i = 0; i < num_vertices_; i ++) { + for (auto it = vertices[i].begin(); it < vertices[i].end(); it ++) { + Edge e = *it; + assert(i == e.src); + if (symmetrize_ || (!symmetrize_ && !transpose)) { + weight_[offsets[e.src]] = e.elabel; + colidx_[offsets[e.src]++] = e.dst; + } + if (symmetrize_ || (!symmetrize_ && transpose)) { + weight_[offsets[e.dst]] = e.elabel; + colidx_[offsets[e.dst]++] = e.src; + } + } + } + } +}; + +#endif diff --git a/libdeepgalois/math_functions.hpp b/libdeepgalois/math_functions.hpp new file mode 100644 index 0000000000..8791416441 --- /dev/null +++ b/libdeepgalois/math_functions.hpp @@ -0,0 +1,500 @@ +#ifndef _MATH_FUNCTIONS_ +#define _MATH_FUNCTIONS_ +#include +#include "utils.h" +#include + +#ifdef WITH_BLAS +extern "C" { +#include +//#include +} +#endif + +const float negative_slope = 0; + +// vector add +template +inline void vadd(const std::vector &a, const std::vector &b, std::vector &out) { + //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; + size_t n = out.size(); + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; +} + +template +inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; +} + +// vector subtract +template +inline void vsub(const std::vector &in_a, const std::vector &in_b, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; +} + +// vector multiply +template +inline void vmul(const std::vector &in_a, const std::vector &in_b, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; +} + +// vector divide +template +inline void vdiv(const std::vector &in_a, const std::vector &in_b, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) { + assert(in_b[i] != 0); + out[i] = in_a[i] / in_b[i]; + } +} + +// vector add scalar +template +inline void add_scalar(const DataTy alpha, std::vector &Y) { + for (size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; +} + +// vector subtract scalar +template +inline void sub_scalar(const DataTy alpha, std::vector &Y) { + for (size_t i = 0; i < Y.size(); ++i) Y[i] -= 
alpha; +} + +// vector multiply scalar +template +inline void mul_scalar(const DataTy alpha, std::vector &Y) { + for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; +} + +template +inline void mul_scalar(size_t n, const DataTy alpha, const DataTy *in, DataTy *out) { + for (size_t i = 0; i < n; ++i) out[i] = alpha *in[i]; +} + +// vector divide scalar +template +inline void div_scalar(const DataTy alpha, std::vector &Y) { + assert(alpha != 0); + for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; +} + +// dot product +template +inline DataTy dot(const std::vector &x, const std::vector &y) { + DataTy sum = 0; + for (size_t i = 0; i < x.size(); ++i) + sum += x[i] * y[i]; + return sum; +} + +// matrix-vector multiply +inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { + size_t m = out_vector.size(); + size_t n = in_vector.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out_vector[i] += matrix[i*n+j] * in_vector[j]; + } + } +} + +// vector-vector multiply +inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { + size_t m = a.size(); + size_t n = b.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out[i][j] += a[i] * b[j]; + } + } +} + +// matrix addition +inline void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { + for (size_t i = 0; i < x; ++i) + for (size_t j = 0; j < y; ++j) + C[i][j] = A[i][j] + B[i][j]; +} + +// TODO: vectorize +template +inline void copy2D1D(const tensor_t &in, vec_t &out) { + size_t x = in.size(); + size_t y = in[0].size(); +#ifdef WITH_BLAS + auto ptr = &out[0]; + for (size_t i = 0; i < x; i++) { + std::copy(in[i].begin(), in[i].end(), ptr); + ptr += y; + } +#else + assert(out.size() == x*y); + for (size_t i = 0; i < x; i ++) { + for (size_t j = 0; j < y; j ++) { + out[i*y+j] = in[i][j]; + } + } +#endif +} + +// matrix multiply: all 2D +inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(C.size() == dim_x); + assert(B.size() == dim_z); + assert(B[0].size() == dim_y); + + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } +} + +inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const vec_t &A, const vec_t &B, vec_t &C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); +#ifdef WITH_BLAS + const int M = dim_x; + const int N = dim_y; + const int K = dim_z; + const float alpha = 1.0; + const float beta = 0.0; + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, &A[0], lda, &B[0], ldb, beta, &C[0], N); +#else + //std::cout << "using naive matmul, slow\n"; + assert(A.size() == dim_x*dim_z); + assert(B.size() == dim_z*dim_y); + assert(C.size() == dim_x*dim_y); + + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i*dim_y+j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i*dim_y+j] += A[i*dim_z+k] * B[k*dim_y+j]; + } + } + } +#endif + Tmatmul.stop(); +} + +inline void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_z = A[0].size(); + assert(B.size() == dim_z*dim_y); + assert(C.size() == dim_x*dim_y); + +#ifdef WITH_BLAS + vec_t A1D(dim_x*dim_z); + copy2D1D(A, A1D); + matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); +#else + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i*dim_y+j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i*dim_y+j] += A[i][k] * B[k][j]; + } + } + } +#endif +} + +// matrix multiply +inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = C.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(A.size() == dim_x); + assert(B.size() == dim_y*dim_z); + +#ifdef WITH_BLAS + vec_t A1D(dim_x*dim_z); + vec_t C1D(dim_x*dim_y, 0); + auto ptr = &A1D[0]; + for (size_t i = 0; i < dim_x; i++) { + std::copy(A[i].begin(), A[i].end(), ptr); + ptr += dim_z; + } + matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C1D); + for (size_t i = 0; i < dim_x; i++) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = C1D[i*dim_y+j]; + } + } +#else + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i][j] += A[i][k] * B[k*dim_y+j]; + } + } + } +#endif +} + +template +inline void transpose2D(const tensor_t &in, tensor_t &out) { + size_t x = in.size(); + size_t y = in[0].size(); + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i][j] = in[j][i]; + } + } +} + +// TODO: vectorize +template +inline void transpose2D1D(const tensor_t &in, vec_t &out) { + size_t x = in.size(); + size_t y = in[0].size(); + assert(out.size() == x*y); + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i*x+j] = in[j][i]; + } + } +} + +template +inline void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i*x+j] = in[j*y+i]; + } + } +} + +template +inline int argmax(const size_t n, const std::vector &x) { + DataTy max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +inline void clear(vec_t &in) { + for (size_t i = 0; i < in.size(); i++) in[i] = 0; +} + +inline void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); // TODO: vectorize clear + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor = in[dst]; + mul_scalar(b, neighbor); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] + } + }, 
galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +inline void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + size_t len = out[0].size(); + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst*len], neighbor.data()); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +template +inline void relu(const std::vector &in, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) { + out[i] = std::max(in[i], (DataTy)0) + negative_slope * std::min(in[i], (DataTy)0); + } +} + +template +inline void d_relu(const std::vector &in_diff, const std::vector &fv, std::vector &out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) { + out_diff[i] = in_diff[i] * ((fv[i] > (DataTy)0) + negative_slope * (fv[i] <= (DataTy)0)); + } +} + +inline void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { + vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff +} + +inline void d_vadd(vec_t &in_diff, vec_t &out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) + out_diff[i] = in_diff[i]; +} + +template +inline float reduce_mean(const std::vector &x) { + size_t n = x.size(); + assert(n > 0); + float sum = (float)x[0]; + for (size_t i = 1; i < n; i++) { + sum += (float)x[i]; + } + return sum / (float)n; +} + +const float scale_ = 1. / (1. - dropout_rate); + +inline void dropout(const vec_t &in, std::vector &mask, vec_t &out) { + assert(mask.size() == out.size()); + //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale_; +} + +inline void dropout(const vec_t &in, std::vector &mask, float_t *out) { + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale_; +} + +inline void d_dropout(const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { + for (size_t i = 0; i < in_diff.size(); ++i) + out_diff[i] = in_diff[i] * mask[i] * scale_; +} + +template +inline DataTy sigmoid_func(DataTy x) { + return 0.5 * tanh(0.5 * x) + 0.5; +} + +// Sigmoid +template +inline void sigmoid(std::vector &fv) { + size_t count = fv.size(); + for (size_t i = 0; i < count; ++i) { + fv[i] = sigmoid_func(fv[i]); + } +} + +// Softmax function takes an N-dimensional vector (X) of real number, +// and transforms it into a vector of real number in range (0,1) which add upto 1. 
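+// e.g. for a hypothetical input {1, 2, 3}, softmax returns approximately {0.090, 0.245, 0.665}.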
+// To make softmax func numerically stable, we simply normalize the values in the vector, +// by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) +// exps = np.exp(X - np.max(X)) +// exps / np.sum(exps) +template +inline void softmax(const std::vector &input, std::vector &output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < input.size(); i++) + output[i] /= denominator; +} + +template +inline void log_softmax(const std::vector &input, std::vector &output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) + denominator += std::exp(input[i] - max); + for (size_t i = 0; i < input.size(); i++) + output[i] = input[i] - max - denominator; +} + +// Due to the desirable property of softmax function outputting a probability distribution, +// we often use it as the final layer in neural networks. +// For this we need to calculate the derivative or gradient, +// and pass it back to the previous layer during backpropagation. +template +inline void d_softmax(const std::vector &y, const std::vector &p, + std::vector &dy, const std::vector &dp) { + auto n = y.size(); + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + //DataTy delta_ij = i == j? 1 : 0; + //df[i] += p[j] * (delta_ij - p[i]); + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + // dy = dp * (gradient of softmax) + dy[i] = dot(dp, df); + } +/* + for (size_t j = 0; j < x.size(); j++) { + for (size_t k = 0; k < x.size(); k++) { + df[k] = (k == j) ? y[j] * (float_t(1) - y[j]) : -y[k] * y[j]; + } + dx[j] = vectorize::dot(&dy[0], &df[0], len); + } +*/ +} + +// cross-entropy loss function for multi-class classification +// y: ground truth +// p: predicted probability +template +inline DataTy cross_entropy(const std::vector &y, const std::vector &p) { + auto n = y.size(); + assert(n > 0); + DataTy loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) continue; + if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); + //if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * std::log(float_t(1e-10)); + else loss -= y[i] * std::log(p[i]);// + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); + //loss -= y[i] * std::log(p[i]); + } + return loss; +} + +template +inline void d_cross_entropy(const std::vector &y, const std::vector &p, std::vector &d) { + auto n = y.size(); + //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + //d[i] = p[i] - y[i]; + } +} + +#endif diff --git a/libdeepgalois/net.h b/libdeepgalois/net.h new file mode 100644 index 0000000000..fac7caee00 --- /dev/null +++ b/libdeepgalois/net.h @@ -0,0 +1,341 @@ +#ifndef _MODEL_H_ +#define _MODEL_H_ + +#include +#include "gnn.h" +#include "lgraph.h" +#include "layers.h" +#include "optimizer.h" + +#define NUM_CONV_LAYERS 2 + +// N: number of vertices, D: feature vector dimentions, +// E: number of distinct labels, i.e. 
number of vertex classes +// layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) +// layer 2: features N x 16, weights 16 x E, out N x E +class Net { +public: + Net() {} + + // user-defined aggregate function + virtual void aggregate(Graph *g, size_t dim, const tensor_t &in_feats, tensor_t &out_feats) {} + + // user-defined combine function + virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} + + void init() { + assert(dropout_rate < 1.0); + read_graph(dataset, g); + n = g.size(); // N + labels.resize(n, 0); // label for each vertex: N x 1 + num_classes = read_labels(dataset, labels); + + std::cout << "Reading label masks ... "; + train_mask.resize(n, 0); + val_mask.resize(n, 0); + if (dataset == "reddit") { + train_begin = 0, train_count = 153431, train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; + } else { + train_count = read_masks(dataset, "train", train_begin, train_end, train_mask); + val_count = read_masks(dataset, "val", val_begin, val_end, val_mask); + } + std::cout << "Done\n"; + + num_layers = NUM_CONV_LAYERS + 1; + feature_dims.resize(num_layers + 1); + input_features.resize(n); // input embedding: N x D + feature_dims[0] = read_features(dataset, input_features); // input feature dimension: D + feature_dims[1] = hidden1; // hidden1 level embedding: 16 + feature_dims[2] = num_classes; // output embedding: E + feature_dims[3] = num_classes; // normalized output embedding: E + layers.resize(num_layers); + } + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } + size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } + size_t get_nnodes() { return n; } + size_t get_nedges() { return g.sizeEdges(); } + size_t get_ft_dim() { return feature_dims[0]; } + size_t get_nclasses() { return num_classes; } + size_t get_label(size_t i) { return labels[i]; } + void construct_layers() { + std::cout << "\nConstructing layers...\n"; + append_conv_layer(0, true); // first conv layer + append_conv_layer(1); // hidden1 layer + append_out_layer(2); // output layer + layers[0]->set_in_data(input_features); // feed input data + } + + void set_netphase(net_phase phase) { + for (size_t i = 0; i < num_layers; i ++) + layers[i]->set_context(phase); + } + + void print_layers_info() { + for (size_t i = 0; i < num_layers; i ++) + layers[i]->print_layer_info(); + } + + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true) { + assert(layer_id < NUM_CONV_LAYERS); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = n; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, in_dims, out_dims); + if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); + } + + void append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = n; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims, &labels); + connect(layers[layer_id-1], layers[layer_id]); + } + + // forward propagation: [begin, end) is the range of samples used. 
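+  // Only samples with masks[i] == 1 inside [begin, end) contribute to the returned
+  // masked loss (see layer::get_masked_loss()); masked_accuracy() below reuses the same mask.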
+ acc_t fprop(size_t begin, size_t end, size_t count, MaskList &masks) { + // set mask for the last layer + layers[num_layers-1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i ++) + layers[i]->forward(); + return layers[num_layers-1]->get_masked_loss(); + } + + // back propogation + void bprop() { + for (size_t i = num_layers; i != 0; i --) + layers[i-1]->backward(); + } + + // update trainable weights after back-propagation + void update_weights(optimizer *opt) { + for (size_t i = 0; i < num_layers; i ++) + if (layers[i]->trainable()) layers[i]->update_weight(opt); + } + + // evaluate, i.e. inference or predict + double evaluate(size_t begin, size_t end, size_t count, MaskList &masks, acc_t &loss, acc_t &acc) { + Timer t_eval; + t_eval.Start(); + loss = fprop(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks); + t_eval.Stop(); + return t_eval.Millisecs(); + } + + // training + void train(optimizer *opt) { + std::cout << "\nStart training...\n"; + galois::StatTimer Tupdate("Train-WeightUpdate"); + galois::StatTimer Tfw("Train-Forward"); + galois::StatTimer Tbw("Train-Backward"); + galois::StatTimer Tval("Validation"); + Timer t_epoch; + // run epoches + for (size_t i = 0; i < epochs; i++) { + std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; + t_epoch.Start(); + + // training steps + set_netphase(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + Tfw.start(); + train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict + Tfw.stop(); + Tbw.start(); + bprop(); // back propogation + Tbw.stop(); + Tupdate.start(); + update_weights(opt); // update parameters + Tupdate.stop(); + set_netphase(net_phase::test); + std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + + if (do_validate) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + Tval.start(); + double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); + Tval.stop(); + std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; + std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; + } else { + std::cout << " train_time = " << epoch_time << " ms\n"; + } + } + } + +protected: + size_t n; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + std::vector feature_dims; // feature dimnesions for each layer + + Graph g; // the input graph, |V| = N + tensor_t input_features; // input features: N x D + std::vector labels; // labels for classification: N x 1 + MaskList train_mask, val_mask; // masks for traning and validation + size_t train_begin, train_end, train_count, val_begin, val_end, val_count; + + std::vector layers; // all the layers in the neural network + /* + inline void init_features(size_t dim, vec_t &x) { + std::default_random_engine rng; + std::uniform_real_distribution dist(0, 0.1); + for (size_t i = 0; i < dim; ++i) + x[i] = dist(rng); + } + //*/ + + // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). 
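+  // e.g. (hypothetical) a one-hot row "0 0 1 0" in <dataset>-labels.txt is stored here as labels[v] = 2.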
+ // Note that labels is not one-hot encoded vector and it can be computed + // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. + size_t read_labels(std::string dataset_str, LabelList &labels) { + std::cout << "Reading labels ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == labels.size()); // number of vertices + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < n; ++idx) { + label_stream >> x; + if (x != 0) { + labels[v] = idx; + break; + } + } + v ++; + } + in.close(); + t_read.Stop(); + // number of vertex classes + std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; + } + + size_t read_features(std::string dataset_str, tensor_t &feats) { + std::cout << "Reading features ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == feats.size()); // m = number of vertices + for (size_t i = 0; i < m; ++i) { + feats[i].resize(n); + for (size_t j = 0; j < n; ++j) + feats[i][j] = 0; + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u][v] = w; + } + /* + for (size_t i = 0; i < 10; ++i) + for (size_t j = 0; j < n; ++j) + if (feats[i][j] > 0) + std::cout << "feats[" << i << "][" << j << "]: " << feats[i][j] << std::endl; + //*/ + in.close(); + t_read.Stop(); + std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; + } + + unsigned read_graph(std::string dataset_str, Graph &graph) { + //printf("Start readGraph\n"); + galois::StatTimer Tread("GraphReadingTime"); + Tread.start(); + LGraph lgraph; + unsigned max_degree = 0; + if (filetype == "el") { + std::string filename = path + dataset_str + ".el"; + printf("Reading .el file: %s\n", filename.c_str()); + lgraph.read_edgelist(filename.c_str(), true); //symmetrize + genGraph(lgraph, graph); + } else if (filetype == "gr") { + std::string filename = path + dataset_str + ".csgr"; + printf("Reading .gr file: %s\n", filename.c_str()); + galois::graphs::readGraph(graph, filename); + /* + galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { + graph.getData(vid) = 1; + //for (auto e : graph.edges(n)) graph.getEdgeData(e) = 1; + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("assignVertexLabels")); + std::vector degrees(graph.size()); + galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { + degrees[vid] = std::distance(graph.edge_begin(vid), graph.edge_end(vid)); + }, galois::loopname("computeMaxDegree")); + max_degree = *(std::max_element(degrees.begin(), degrees.end())); + */ + } else { printf("Unkown file format\n"); exit(1); } + if (filetype != "gr") { + max_degree = lgraph.get_max_degree(); + lgraph.clean(); + } + printf("max degree = %u\n", max_degree); + Tread.stop(); + //printf("Done readGraph\n"); + std::cout << "num_vertices " << g.size() << " num_edges " << g.sizeEdges() << "\n"; + return max_degree; + } + + void genGraph(LGraph &lg, Graph &g) { + g.allocateFrom(lg.num_vertices(), 
lg.num_edges()); + g.constructNodes(); + for (size_t i = 0; i < lg.num_vertices(); i++) { + g.getData(i) = 1; + auto row_begin = lg.get_offset(i); + auto row_end = lg.get_offset(i+1); + g.fixEndEdge(i, row_end); + for (auto offset = row_begin; offset < row_end; offset ++) + g.constructEdge(offset, lg.get_dest(offset), 0); // do not consider edge labels now + } + } + + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { + // comparing outputs with the ground truth (labels) + //acc_t accuracy_all = 0.0; + AccumF accuracy_all; + accuracy_all.reset(); + //for (size_t i = begin; i < end; i++) { + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + int prediction = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); + if ((label_t)prediction == labels[i]) accuracy_all += 1.0; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + //} + return accuracy_all.reduce() / (acc_t)count; + } +}; + +#endif diff --git a/libdeepgalois/node.h b/libdeepgalois/node.h new file mode 100644 index 0000000000..deffebad9b --- /dev/null +++ b/libdeepgalois/node.h @@ -0,0 +1,109 @@ +#pragma once +#include +class node; +class layer; +class edge; + +typedef std::shared_ptr edgeptr_t; + +// node data structure +class node : public std::enable_shared_from_this { +public: + node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} + virtual ~node() {} + const edgeptr_t prev() const { return prev_; } + //const std::vector &prev() const { return prev_; } + const edgeptr_t next() const { return next_; } + //const std::vector &next() const { return next_; } + //std::vector prev_nodes() const; + //std::vector next_nodes() const; + +protected: + node() = delete; + friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); + //mutable std::vector prev_; + //mutable std::vector next_; + mutable edgeptr_t prev_; + mutable edgeptr_t next_; +}; + +// edges manage the input/output data and gradients between nodes +class edge { +public: + edge(node *prev, size_t len) : + ft_dim_(len), + data_({vec_t(len)}), + grad_({vec_t(len)}), + prev_(prev) {} + + void merge_grads(vec_t *dst) { + assert(!grad_.empty()); + const auto &grad_head = grad_[0]; + size_t sz = grad_head.size(); + dst->resize(sz); + float_t *pdst = &(*dst)[0]; + std::copy(grad_head.begin(), grad_head.end(), pdst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < grad_.size(); ++sample) { + for (size_t i = 0; i < sz; i++) + pdst[i] += grad_[sample][i]; + //vectorize::reduce(&grad_[sample][0], sz, pdst); + } + } + void clear_grads() { + for (size_t sample = 0; sample < grad_.size(); ++sample) { + auto &g = grad_[sample]; + std::fill(g.begin(), g.end(), 0.0); // TODO: need vectorize + //vectorize::fill(&g[0], g.size(), float_t{0}); + } + } + + tensor_t *get_data_ptr() { return &data_; } + tensor_t &get_data() { return data_; } + //const tensor_t *get_data() const { return &data_; } + const tensor_t &get_data() const { return data_; } + //tensor_t *get_gradient() { return &grad_; } + tensor_t &get_gradient() { return grad_; } + //const tensor_t *get_gradient() const { return &grad_; } + const tensor_t &get_gradient() const { return grad_; } + + //const std::vector &next() const { return next_; } + const node *next() const { return next_; } + node *prev() { return prev_; } + const node *prev() const { return prev_; } + //const shape3d &shape() const { return 
shape_; } + //vector_type vtype() const { return vtype_; } + //void add_next_node(node *next) { next_.push_back(next); } + void add_next_node(node *next) { next_ = next; } +private: + //shape3d shape_; + size_t ft_dim_; + //vector_type vtype_; + tensor_t data_; + tensor_t grad_; + node *prev_; // previous node, "producer" of this tensor + node *next_; // next node, "consumer" of this tensor + //std::vector next_; // next nodes, "consumers" of this tensor +}; +/* +inline std::vector node::prev_nodes() const { + std::vector vecs; + for (auto &e : prev_) { + if (e && e->prev()) { + vecs.insert(vecs.end(), e->prev()); + } + } + return vecs; +} + +inline std::vector node::next_nodes() const { + std::vector vecs; + for (auto &e : next_) { + if (e) { + auto n = e->next(); + vecs.insert(vecs.end(), n.begin(), n.end()); + } + } + return vecs; +} +*/ diff --git a/libdeepgalois/optimizer.h b/libdeepgalois/optimizer.h new file mode 100644 index 0000000000..2896881fed --- /dev/null +++ b/libdeepgalois/optimizer.h @@ -0,0 +1,221 @@ +#pragma once + +#include +#include +#include "types.h" + +// base class of optimizer +// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function) +struct optimizer { + optimizer() = default; + optimizer(const optimizer &) = default; + optimizer(optimizer &&) = default; + optimizer &operator=(const optimizer &) = default; + optimizer &operator=(optimizer &&) = default; + virtual ~optimizer() = default; + virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; + virtual void reset() {} // override to implement pre-learning action +}; + +// helper class to hold N values for each weight +template +struct stateful_optimizer : public optimizer { + void reset() override { for (auto &e : E_) e.clear(); } +protected: + template + vec_t &get(const vec_t &key) { + static_assert(Index < N, "index out of range"); + if (E_[Index][&key].empty()) E_[Index][&key].resize(key.size(), float_t()); + return E_[Index][&key]; + } + std::unordered_map E_[N]; +}; + +/** + * adaptive gradient method + * + * J Duchi, E Hazan and Y Singer, + * Adaptive subgradient methods for online learning and stochastic optimization + * The Journal of Machine Learning Research, pages 2121-2159, 2011. 
+ **/ +struct adagrad : public stateful_optimizer<1> { + adagrad() : alpha(learning_rate), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } + } + float_t alpha; // learning rate + private: + float_t eps; +}; + +/** + * RMSprop + * + * T Tieleman, and G E Hinton, + * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) + **/ +struct RMSprop : public stateful_optimizer<1> { + RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, galois::loopname("rms_update")); + } + float_t alpha; // learning rate + float_t mu; // decay term +private: + float_t eps; // constant value to avoid zero-division +}; + +// Adam: A Method for Stochastic Optimization +// http://arxiv.org/abs/1412.6980 +struct adam : public stateful_optimizer<2> { + adam() : alpha(learning_rate), b1(float_t(0.9)), + b2(float_t(0.999)), b1_t(float_t(0.9)), + b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &vt = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size(), galois::steal(), galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + float_t b2_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +/** + * @brief [a new optimizer (2015)] + * @details [see Adam: A Method for Stochastic Optimization (Algorithm 2) + * http://arxiv.org/abs/1412.6980] + * + */ +struct adamax : public stateful_optimizer<2> { + adamax() + : alpha(float_t(0.002)), + b1(float_t(0.9)), + b2(float_t(0.999)), + b1_t(b1), + eps(float_t(1e-8)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); + b1_t *= b1; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +/** + * SGD without momentum + * + * slightly faster than tiny_dnn::momentum + **/ +struct gradient_descent : public optimizer { + gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} + void update(const 
vec_t &dW, vec_t &W, bool parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); + }, galois::loopname("gradient_descent_update")); + } + float_t alpha; // learning rate + float_t lambda; // weight decay +}; + +/** + * SGD with momentum + * + * B T Polyak, + * Some methods of speeding up the convergence of iteration methods + * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. + **/ +struct momentum : public stateful_optimizer<1> { + public: + momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + + //for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + //}); + }, galois::loopname("momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + +/** + * SGD with Nesterov momentum + * + * Y Nesterov, + * A method for unconstrained convex minimization problem with the rate of + * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. + **/ +struct nesterov_momentum : public stateful_optimizer<1> { + public: + nesterov_momentum() + : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + + //for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + //}); + }, galois::loopname("nesterov_momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + diff --git a/libdeepgalois/random.h b/libdeepgalois/random.h new file mode 100644 index 0000000000..9236e9c391 --- /dev/null +++ b/libdeepgalois/random.h @@ -0,0 +1,63 @@ +#ifndef RANDOM_H +#define RANDOM_H +typedef boost::mt19937 rng_t; + +// random seeding +int64_t seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, using fallback algorithm to generate seed instead."; + if (f) fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +// This random number generator facade hides boost and CUDA rng +// implementation from one another (for cross-platform compatibility). 
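[Editor's note] To make the optimizer interface defined above concrete: every optimizer receives the gradient vector dW for one parameter vector W and updates W in place; the stateful variants (adagrad, RMSprop, adam, adamax, momentum, nesterov_momentum) additionally keep per-parameter history via stateful_optimizer::get<Index>(W). The following is an illustrative, self-contained sketch of that contract using a plain SGD step; the sgd_sketch struct and the toy numbers are the editor's invention and are not part of this commit.

    #include <cstddef>
    #include <vector>
    typedef float float_t;
    typedef std::vector<float_t> vec_t;

    // Mimics optimizer::update(dW, W, parallelize): gradient in, weights updated in place.
    struct sgd_sketch {
      float_t alpha = 0.01f; // learning rate
      void update(const vec_t& dW, vec_t& W, bool /*parallelize*/) {
        for (size_t i = 0; i < W.size(); i++)
          W[i] -= alpha * dW[i]; // plain gradient step; adam/momentum would add state here
      }
    };

    int main() {
      vec_t W  = {0.5f, -0.2f}; // one layer's weights, flattened
      vec_t dW = {0.1f,  0.3f}; // gradient produced by back-propagation
      sgd_sketch opt;
      opt.update(dW, W, false); // after this, W is approximately {0.499, -0.203}
      return 0;
    }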
+class RNG { +public: + RNG() : generator_(new Generator()) { } + explicit RNG(unsigned int seed) : generator_(new Generator(seed)) { } + explicit RNG(const RNG&); + RNG& operator=(const RNG& other) { generator_ = other.generator_; return *this; } + void* generator() { return static_cast(generator_->rng()); } +private: + class Generator { + public: + Generator() : rng_(new rng_t(seedgen())) {} + explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} + rng_t* rng() { return rng_.get(); } + private: + std::shared_ptr rng_; + }; + + std::shared_ptr generator_; +}; + +std::shared_ptr random_generator_; +inline static RNG& rng_stream() { + random_generator_.reset(new RNG()); + return *random_generator_; +} + +inline rng_t* rng() { + return static_cast(rng_stream().generator()); +} + +#include +template +void rng_bernoulli(const DataTy p, std::vector &r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(rng(), random_distribution); + for (size_t i = 0; i < r.size(); ++i) + r[i] = static_cast(variate_generator()); +} + +#endif diff --git a/libdeepgalois/timer.h b/libdeepgalois/timer.h new file mode 100644 index 0000000000..e6c838c37b --- /dev/null +++ b/libdeepgalois/timer.h @@ -0,0 +1,21 @@ +#ifndef TIMER_H_ +#define TIMER_H_ +#include + +class Timer { +public: + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } + double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } + double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } +private: + struct timeval start_time_; + struct timeval elapsed_time_; +}; +#endif // TIMER_H_ diff --git a/libdeepgalois/types.h b/libdeepgalois/types.h new file mode 100644 index 0000000000..bc9fe21049 --- /dev/null +++ b/libdeepgalois/types.h @@ -0,0 +1,34 @@ +#ifndef TYPES_H +#define TYPES_H +#include +#include "galois/Galois.h" +#include "galois/graphs/LCGraph.h" + +#ifdef CNN_USE_DOUBLE +typedef double float_t; +typedef double feature_t; +#else +typedef float float_t; +typedef float feature_t; // feature type +#endif +typedef std::vector vec_t; // feature vector (1D) +typedef std::vector tensor_t; // feature vectors (2D): num_samples x feature_dim +typedef std::vector FV; // feature vector +typedef std::vector FV2D; // feature vectors: num_samples x feature_dim +typedef float acc_t; // Accuracy type +typedef short label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test +typedef std::vector LabelList; // label list to store label for each vertex +typedef std::vector MaskList; // mask list to store mask for each vertex +typedef galois::GAccumulator AccumF; +typedef galois::GAccumulator AccumU; + +#ifdef EDGE_LABEL +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#else +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#endif + +typedef Graph::GraphNode GNode; + +#endif diff --git a/libdeepgalois/utils.h b/libdeepgalois/utils.h new file mode 100644 index 0000000000..70356654b9 --- /dev/null +++ b/libdeepgalois/utils.h @@ -0,0 +1,119 @@ +#pragma once + +#include +#include +#include 
+#include +#include +#include "gnn.h" + +std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset +enum class net_phase { train, test }; + +class ResourceManager { +public: + ResourceManager() {} + ~ResourceManager(){} + //peak memory usage + std::string get_peak_memory() { + double kbm; + struct rusage CurUsage; + getrusage(RUSAGE_SELF, &CurUsage); + kbm = (double)CurUsage.ru_maxrss; + double mbm = kbm / 1024.0; + double gbm = mbm / 1024.0; + return + "Peak memory: " + + to_string_with_precision(mbm, 3) + " MB; " + + to_string_with_precision(gbm, 3) + " GB"; + } +private: + template + std::string to_string_with_precision(const T a_value, const int& n) { + std::ostringstream out; + out << std::fixed; + out << std::setprecision(n) << a_value; + return out.str(); + } +}; + +class Timer { +public: + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } + double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } + double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } +private: + struct timeval start_time_; + struct timeval elapsed_time_; +}; + +class random_generator { +public: + static random_generator &get_instance() { + static random_generator instance; + return instance; + } + std::mt19937 &operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } + +private: + random_generator() : gen_(1) {} + std::mt19937 gen_; +}; + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +inline bool bernoulli(float_t p) { + return uniform_rand(float_t{0}, float_t{1}) <= p; +} + +size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, MaskList &masks) { + if (dataset_str != "citeseer" && dataset_str != "cora") { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + //std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count ++; + } + } + i ++; + } + //std::cout << mask_type + "_mask range: [" << begin << ", " << end + // << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; +} + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt new file mode 100644 index 0000000000..c03a5c6676 --- /dev/null +++ b/lonestargnn/CMakeLists.txt @@ -0,0 +1,8 @@ +include_directories(BEFORE + ${CMAKE_SOURCE_DIR}/libllvm/include + ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include +) +include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) +include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois) + 
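[Editor's note] The on-disk layout of the mask files parsed by read_masks() in utils.h above is easy to miss: the first line holds the [begin, end) range of the split, and every following line holds one 0/1 flag for the corresponding vertex id. The self-contained sketch below reproduces that parsing on an invented five-vertex file; the file name and all numbers are made up for illustration and are not part of this commit.

    #include <cstdint>
    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
      // Write a toy mask file in the same format as <dataset>-train_mask.txt.
      std::ofstream out("toy-train_mask.txt");
      out << "0 3\n" << "1\n1\n1\n0\n0\n"; // header "begin end", then one flag per vertex
      out.close();

      // Parse it the way read_masks() does.
      std::ifstream in("toy-train_mask.txt");
      size_t begin, end, i = 0, count = 0;
      in >> begin >> end >> std::ws;          // first line: [begin, end) of the labeled range
      std::vector<uint8_t> masks(5, 0);
      std::string line;
      while (std::getline(in, line)) {        // remaining lines: one 0/1 flag per vertex id
        unsigned m = 0;
        std::istringstream(line) >> m;
        if (i >= begin && i < end && m == 1) { masks[i] = 1; count++; }
        i++;
      }
      std::cout << count << " masked vertices in [" << begin << ", " << end << ")\n";
      return 0;
    }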
+add_subdirectory(gcn)
diff --git a/lonestargnn/README.md b/lonestargnn/README.md
new file mode 100644
index 0000000000..bae49e36a0
--- /dev/null
+++ b/lonestargnn/README.md
@@ -0,0 +1,60 @@
+DESCRIPTION
+===========
+
+This application performs vertex classification in an undirected graph.
+It uses a graph neural network (GNN) to learn vertex features,
+which are then used to classify vertices into different classes.
+
+INPUT
+===========
+
+The input dataset contains three parts:
+1. the input graph file: edgelist format of a |V| x |V| sparse matrix.
+2. the vertex label file: |V| lines, each containing one integer.
+3. the input feature file: edgelist format of a |V| x |D| sparse matrix.
+
+Vertex ids are expected to be sequential integers between 0 and |V|-1.
+|V| is the number of vertices. |D| is the dimension of the input feature vectors.
+
+BUILD
+===========
+
+1. Run cmake in the build directory: `cd build; cmake -DUSE_DEEPGALOIS=1 -DUSE_BLAS=1 ../`
+
+2. Run `cd /lonestargnn/gcn; make -j`
+
+RUN
+===========
+
+The following are a few example command lines.
+
+$ export OPENBLAS_NUM_THREADS=28
+$ ./gcn cora -t=1 -k=3
+$ ./gcn citeseer -t=3 -k=30
+$ ./gcn reddit -t=56 -k=3
+
+PERFORMANCE
+===========
+- I
+- I
+- I
+
+REFERENCES
+===========
+The GCN model:
+Semi-Supervised Classification with Graph Convolutional Networks (ICLR 2017)
+http://arxiv.org/abs/1609.02907
+https://github.com/tkipf/gcn
+
+DGL:
+Deep Graph Library: Towards Efficient and Scalable Deep Learning on Graphs
+https://arxiv.org/abs/1909.01315
+https://github.com/dmlc/dgl
+
+GraphSAGE:
+Inductive Representation Learning on Large Graphs
+http://snap.stanford.edu/graphsage/
+
+NeuGraph: Parallel Deep Neural Network Computation on Large Graphs
+https://www.usenix.org/conference/atc19/presentation/ma
+
diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt
new file mode 100644
index 0000000000..f1a65740f7
--- /dev/null
+++ b/lonestargnn/gcn/CMakeLists.txt
@@ -0,0 +1,16 @@
+SET(USE_BLAS ON CACHE BOOL "Use blas")
+
+SET(BLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include)
+SET(BLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib)
+
+if (USE_BLAS)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_BLAS")
+  include_directories(${BLAS_INC})
+  link_directories(${BLAS_LIB})
+endif()
+
+app(gcn gcn.cpp)
+
+if (USE_BLAS)
+  target_link_libraries(gcn -lopenblas)
+endif()
diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp
new file mode 100644
index 0000000000..8d1f792fd1
--- /dev/null
+++ b/lonestargnn/gcn/gcn.cpp
@@ -0,0 +1,47 @@
+// Graph Neural Networks
+// Xuhao Chen
+#include "gnn.h"
+#include "lonestargnn.h"
+
+const char* name = "Graph Convolutional Networks";
+const char* desc = "Graph convolutional neural networks on an undirected graph";
+const char* url = 0;
+
+int main(int argc, char** argv) {
+  galois::SharedMemSys G;
+  LonestarGnnStart(argc, argv, name, desc, url);
+  Net network; // the neural network to train
+  network.init();
+  network.construct_layers(); // default setting for now; see its implementation to find how to customize it by the user
+  network.print_layers_info();
+  ResourceManager rm;
+
+  // the optimizer used to update parameters, see optimizer.h for more details
+  //optimizer *opt = new gradient_descent();
+  //optimizer *opt = new adagrad();
+  optimizer *opt = new adam();
+  galois::StatTimer Ttrain("TrainAndVal");
+  Ttrain.start();
+  network.train(opt); // do training using training samples
+  Ttrain.stop();
+
+  if (do_test) {
+    // test using test samples
+    size_t n =

network.get_nnodes(); + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 0, test_end = n, test_count = n; + MaskList test_mask(n, 0); + if (dataset == "reddit") { + test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; + for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; + } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); + galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = network.evaluate(test_begin, test_end, test_count, test_mask, test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; + Ttest.stop(); + } + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; +} + diff --git a/lonestargnn/graphsage/gs-mean.cpp b/lonestargnn/graphsage/gs-mean.cpp new file mode 100644 index 0000000000..b70cdc183c --- /dev/null +++ b/lonestargnn/graphsage/gs-mean.cpp @@ -0,0 +1,41 @@ +// Graph Neural Networks +// Xuhao Chen +#include "gnn.h" + +const char* name = "GraphSage"; +const char* desc = "A graph neural network variant: GraphSAGE"; +const char* url = 0; + +class GraphSageMean: public graph_conv_layer { + // user-defined combine function +}; + +int main(int argc, char** argv) { + galois::SharedMemSys G; + LonestarStart(argc, argv, name, desc, url); + Net network; // the neural network to train + network.init(); // default setting for now; see its implementation to find how to customize it by the user + ResourceManager rm; + + // the optimizer used to update parameters, see optimizer.h for more details + //optimizer *opt = new gradient_descent(); + //optimizer *opt = new adagrad(); + optimizer *opt = new adam(); + galois::StatTimer Ttrain("Train"); + Ttrain.start(); + network.train(opt); // do training using training samples + Ttrain.stop(); + + // test using test samples + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 TODO: replace ad-hoc settings + galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = network.evaluate(test_begin, test_end, test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; + Ttest.stop(); + + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; +} + diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h new file mode 100644 index 0000000000..e53dc2e461 --- /dev/null +++ b/lonestargnn/lonestargnn.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include "galois/Galois.h" +#include "galois/Version.h" +#include "llvm/Support/CommandLine.h" + +//! standard global options to the benchmarks +extern llvm::cl::opt skipVerify; +extern llvm::cl::opt numThreads; +extern llvm::cl::opt statFile; + +//! standard global options to the benchmarks +llvm::cl::opt skipVerify("noverify", llvm::cl::desc("Skip verification step (default value false)"), llvm::cl::init(false)); +llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default value 1)"), llvm::cl::init(1)); +llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); + +static void LonestarGnnPrintVersion() { + std::cout << "LoneStar Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; +} + +//! 
initialize lonestargnn benchmark +void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, const char* url) { + llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + numThreads = galois::setActiveThreads(numThreads); + galois::runtime::setStatFile(statFile); + LonestarGnnPrintVersion(); + std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? app : "unspecified") << "\n"; + if (desc) std::cout << desc << "\n"; + if (url) std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url << "\n"; + std::cout << "\n"; + + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) cmdout << " "; + } + + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); + + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("(NULL)", "Hostname", name); +} + diff --git a/lonestargnn/run-citeseer.sh b/lonestargnn/run-citeseer.sh new file mode 100755 index 0000000000..a70f0bdc1f --- /dev/null +++ b/lonestargnn/run-citeseer.sh @@ -0,0 +1 @@ +./gcn citeseer -t=56 -k=3 From f175999b9a60423d9c994a7e39e5a4a3dbcf28d2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 19 Feb 2020 14:42:07 -0600 Subject: [PATCH 002/660] add gpu --- libdeepgalois/common.h | 50 +++++++++++++++++++++++++++++++ libdeepgalois/cutils.h | 40 +++++++++++++++++++++++++ libdeepgalois/gnn.h | 31 ------------------- libdeepgalois/gpu_kernels.hpp | 41 +++++++++++++++++++++++++ libdeepgalois/layers/relu_layer.h | 10 +++++-- libdeepgalois/math_functions.hpp | 26 ++++++++-------- libdeepgalois/net.h | 3 +- libdeepgalois/utils.h | 1 - lonestargnn/gcn/gcn.cpp | 1 - lonestargnn/lonestargnn.h | 31 +++++++++++++++---- 10 files changed, 179 insertions(+), 55 deletions(-) create mode 100644 libdeepgalois/common.h create mode 100644 libdeepgalois/cutils.h delete mode 100644 libdeepgalois/gnn.h create mode 100644 libdeepgalois/gpu_kernels.hpp diff --git a/libdeepgalois/common.h b/libdeepgalois/common.h new file mode 100644 index 0000000000..e1bff6901d --- /dev/null +++ b/libdeepgalois/common.h @@ -0,0 +1,50 @@ +#pragma once +#include "cutils.h" + +class DeepGalois { +public: + ~DeepGalois(); + enum Brew { CPU, GPU }; + static DeepGalois& Get() { + } + inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } + inline static curandGenerator_t curand_generator() { return Get().curand_generator_; } + inline static Brew mode() { return Get().mode_; } + inline static void set_mode(Brew mode) { Get().mode_ = mode; } + inline static int solver_count() { return Get().solver_count_; } + inline static void set_solver_count(int val) { Get().solver_count_ = val; } + inline static int solver_rank() { return Get().solver_rank_; } + inline static void set_solver_rank(int val) { Get().solver_rank_ = val; } + inline static bool multiprocess() { return Get().multiprocess_; } + inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; } + inline static bool root_solver() { return Get().solver_rank_ == 0; } + static void SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (Get().cublas_handle_) 
CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); + if (Get().curand_generator_) CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); + CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); + } + static void DeviceQuery(); + static bool CheckDevice(const int device_id); + static int FindDevice(const int start_id = 0); + +protected: + cublasHandle_t cublas_handle_; + curandGenerator_t curand_generator_; + shared_ptr random_generator_; + Brew mode_; + // Parallel training + int solver_count_; + int solver_rank_; + bool multiprocess_; + +private: + // The private constructor to avoid duplicate instantiation. + DeepGalois(); +}; + diff --git a/libdeepgalois/cutils.h b/libdeepgalois/cutils.h new file mode 100644 index 0000000000..4356ec2979 --- /dev/null +++ b/libdeepgalois/cutils.h @@ -0,0 +1,40 @@ +#pragma once + +// CUDA: use 256 threads per block +const int CUDA_NUM_THREADS = 256; + +// CUDA: number of blocks for threads. +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +// CUDA: various checks for different function calls. +#define CUDA_CHECK(condition) \ + // Code block avoids redefinition of cudaError_t error \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) + +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ + << caffe::cublasGetErrorString(status); \ + } while (0) + +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ + << caffe::curandGetErrorString(status); \ + } while (0) + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); i += blockDim.x * gridDim.x) + +// CUDA: check for error after kernel execution and exit loudly if there is one. +#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) + diff --git a/libdeepgalois/gnn.h b/libdeepgalois/gnn.h deleted file mode 100644 index d2d2bafb28..0000000000 --- a/libdeepgalois/gnn.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _GNN_H_ -#define _GNN_H_ - -#include "galois/Galois.h" -#include "galois/Reduction.h" -#include "galois/Timer.h" -#include "galois/ParallelSTL.h" -#include "llvm/Support/CommandLine.h" -#include "galois/runtime/Profile.h" -#include - -namespace cll = llvm::cl; -static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph -static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt epochs("k", cll::desc("number of epoch, i.e. 
iterations (default value 1)"), cll::init(1)); -static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); -static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); -static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); -static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); -static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -#define CHUNK_SIZE 256 - -#include "types.h" -#include "utils.h" -#include "net.h" - -#endif diff --git a/libdeepgalois/gpu_kernels.hpp b/libdeepgalois/gpu_kernels.hpp new file mode 100644 index 0000000000..7cb1068fc6 --- /dev/null +++ b/libdeepgalois/gpu_kernels.hpp @@ -0,0 +1,41 @@ +#pragma once +#include +#include +#include +#include +#include "cutils.h" + +// flattern data into 1D before feed into the ReLU operater +__global__ void relu_gpu(const int n, const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(index, n) { + out[index] = in[index] > 0 ? in[index] : 0; + } +} + +__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; + } +} + +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, float* y) { + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} + +void scal_gpu(const int N, const float alpha, float *X) { + CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); +} + diff --git a/libdeepgalois/layers/relu_layer.h b/libdeepgalois/layers/relu_layer.h index 389e6b3c1f..2795fc404e 100644 --- a/libdeepgalois/layers/relu_layer.h +++ b/libdeepgalois/layers/relu_layer.h @@ -13,12 +13,16 @@ class relu_layer : public layer { void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0) + - negative_slope * std::min(in_data[i][j], (float_t)0); + out_data[i][j] = std::max(in_data[i][j], (float_t)0); }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override {} + tensor_t &out_grad, tensor_t &in_grad) override { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + in_grad[i][j] = out_data[i][j] > float_t(0) ? out_grad[i][j] : float_t(0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); + } }; diff --git a/libdeepgalois/math_functions.hpp b/libdeepgalois/math_functions.hpp index 8791416441..f1612aac1c 100644 --- a/libdeepgalois/math_functions.hpp +++ b/libdeepgalois/math_functions.hpp @@ -164,27 +164,27 @@ inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { } } +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C) { galois::StatTimer Tmatmul("MatMul"); Tmatmul.start(); -#ifdef WITH_BLAS - const int M = dim_x; - const int N = dim_y; - const int K = dim_z; - const float alpha = 1.0; - const float beta = 0.0; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, &A[0], lda, &B[0], ldb, beta, &C[0], N); -#else - //std::cout << "using naive matmul, slow\n"; assert(A.size() == dim_x*dim_z); assert(B.size() == dim_z*dim_y); assert(C.size() == dim_x*dim_y); +#ifdef WITH_BLAS + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, &A[0], &B[0], 0.0, &C[0]); +#else for (size_t i = 0; i < dim_x; ++i) { for (size_t j = 0; j < dim_y; ++j) { C[i*dim_y+j] = 0; diff --git a/libdeepgalois/net.h b/libdeepgalois/net.h index fac7caee00..f6d6930d5a 100644 --- a/libdeepgalois/net.h +++ b/libdeepgalois/net.h @@ -2,7 +2,8 @@ #define _MODEL_H_ #include -#include "gnn.h" +#include "galois/Galois.h" +#include "galois/Timer.h" #include "lgraph.h" #include "layers.h" #include "optimizer.h" diff --git a/libdeepgalois/utils.h b/libdeepgalois/utils.h index 70356654b9..100a997b57 100644 --- a/libdeepgalois/utils.h +++ b/libdeepgalois/utils.h @@ -5,7 +5,6 @@ #include #include #include -#include "gnn.h" std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset enum class net_phase { train, test }; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 8d1f792fd1..72fc8373fc 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -1,6 +1,5 @@ // Graph Neural Networks // Xuhao Chen -#include "gnn.h" #include "lonestargnn.h" const char* name = "Graph Convolutional Networks"; diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index e53dc2e461..efbb862fd7 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -1,10 +1,30 @@ #pragma once -#include #include +#include +#include "galois/Timer.h" #include "galois/Galois.h" #include "galois/Version.h" +#include "galois/Reduction.h" +#include "galois/ParallelSTL.h" +#include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" +#include + +namespace cll = llvm::cl; +static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' +static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph +static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); +static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); +static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); +static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); +static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); +static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); +static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); +static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +#define CHUNK_SIZE 256 //! 
standard global options to the benchmarks extern llvm::cl::opt skipVerify; @@ -17,7 +37,7 @@ llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default val llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); static void LonestarGnnPrintVersion() { - std::cout << "LoneStar Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; + std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; } //! initialize lonestargnn benchmark @@ -33,18 +53,19 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, if (desc) std::cout << desc << "\n"; if (url) std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url << "\n"; std::cout << "\n"; - std::ostringstream cmdout; for (int i = 0; i < argc; ++i) { cmdout << argv[i]; if (i != argc - 1) cmdout << " "; } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); galois::runtime::reportParam("(NULL)", "Threads", numThreads); - char name[256]; gethostname(name, 256); galois::runtime::reportParam("(NULL)", "Hostname", name); } +#include "types.h" +#include "utils.h" +#include "net.h" + From b12af3048addd7133db6c0162abe0bc3d948c32c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 20 Feb 2020 13:29:30 -0600 Subject: [PATCH 003/660] add cpp --- libdeepgalois/CMakeLists.txt | 45 ++- libdeepgalois/gpu_kernels.hpp | 41 --- libdeepgalois/{ => include}/common.h | 8 +- libdeepgalois/{ => include}/cutils.h | 6 +- libdeepgalois/{ => include}/layers.h | 0 .../{ => include}/layers/arithmetic_layer.h | 0 .../{ => include}/layers/graph_conv_layer.h | 20 +- libdeepgalois/{ => include}/layers/layer.h | 0 .../{ => include}/layers/linear_layer.h | 0 libdeepgalois/include/layers/relu_layer.h | 15 + .../include/layers/softmax_loss_layer.h | 18 ++ libdeepgalois/{ => include}/lgraph.h | 0 libdeepgalois/include/math_functions.hpp | 43 +++ libdeepgalois/{ => include}/net.h | 6 +- libdeepgalois/{ => include}/node.h | 1 + libdeepgalois/{ => include}/optimizer.h | 6 +- libdeepgalois/{ => include}/random.h | 0 libdeepgalois/{ => include}/timer.h | 0 libdeepgalois/{ => include}/types.h | 1 + libdeepgalois/{ => include}/utils.h | 5 +- libdeepgalois/layers/relu_layer.h | 28 -- libdeepgalois/layers/softmax_loss_layer.h | 47 --- libdeepgalois/src/layers/relu_layer.cpp | 19 ++ .../src/layers/softmax_loss_layer.cpp | 34 +++ .../math_functions.cpp} | 281 ++++++------------ libdeepgalois/src/math_functions.cu | 84 ++++++ lonestargnn/CMakeLists.txt | 2 +- lonestargnn/gcn/CMakeLists.txt | 18 +- lonestargnn/lonestargnn.h | 1 - 29 files changed, 382 insertions(+), 347 deletions(-) delete mode 100644 libdeepgalois/gpu_kernels.hpp rename libdeepgalois/{ => include}/common.h (90%) rename libdeepgalois/{ => include}/cutils.h (91%) rename libdeepgalois/{ => include}/layers.h (100%) rename libdeepgalois/{ => include}/layers/arithmetic_layer.h (100%) rename libdeepgalois/{ => include}/layers/graph_conv_layer.h (94%) rename libdeepgalois/{ => include}/layers/layer.h (100%) rename libdeepgalois/{ => include}/layers/linear_layer.h (100%) create mode 100644 libdeepgalois/include/layers/relu_layer.h create mode 100644 libdeepgalois/include/layers/softmax_loss_layer.h rename libdeepgalois/{ => include}/lgraph.h (100%) create mode 100644 libdeepgalois/include/math_functions.hpp rename libdeepgalois/{ => include}/net.h (98%) rename libdeepgalois/{ => include}/node.h (99%) rename 
libdeepgalois/{ => include}/optimizer.h (97%) rename libdeepgalois/{ => include}/random.h (100%) rename libdeepgalois/{ => include}/timer.h (100%) rename libdeepgalois/{ => include}/types.h (98%) rename libdeepgalois/{ => include}/utils.h (92%) delete mode 100644 libdeepgalois/layers/relu_layer.h delete mode 100644 libdeepgalois/layers/softmax_loss_layer.h create mode 100644 libdeepgalois/src/layers/relu_layer.cpp create mode 100644 libdeepgalois/src/layers/softmax_loss_layer.cpp rename libdeepgalois/{math_functions.hpp => src/math_functions.cpp} (60%) create mode 100644 libdeepgalois/src/math_functions.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 8caa65ebc9..4f51532898 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,23 +1,52 @@ +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) + +#SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) +#SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) +#SET(ENABLE_GPU OFF CACHE BOOL "Use GPU for DeepGalois") +#if (ENABLE_GPU) +# target_compile_definitions(distbench PRIVATE __GALOIS_HET_CUDA__=1) +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_GPU") +# include_directories(${CUDA_INC}) +# link_directories(${CUDA_LIB}) +#endif() + +#set(sources +# src/layers/relu_layer.cu +#) +#cuda_add_library(deepgalois_gpu +# ${sources} +#OPTIONS -D_FORCE_INLINES +#) +#target_include_directories(deepgalois_gpu PUBLIC +# ${CMAKE_SOURCE_DIR}/libgpu/include +#) +#set_target_properties(deepgalois_gpu PROPERTIES +# INTERFACE_POSITION_INDEPENDENT_CODE On +# POSITION_INDEPENDENT_CODE On +#) +#target_link_libraries(deepgalois -lcudart -lcublas) + set(sources - $ -# $ - $ + src/layers/relu_layer.cpp + src/layers/softmax_loss_layer.cpp + src/math_functions.cpp ) - add_library(deepgalois STATIC ${sources}) -target_link_libraries(deepgalois galois_shmem galois_dist_async gllvm) +target_link_libraries(deepgalois galois_shmem gllvm) target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) +target_link_libraries(deepgalois -lopenblas) target_include_directories(deepgalois PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include - ${CMAKE_SOURCE_DIR}/libdist/include - ${CMAKE_SOURCE_DIR}/libdeepgalios/include ${CMAKE_CURRENT_SOURCE_DIR}/include ) -set_target_properties (deepgalois PROPERTIES +set_target_properties(deepgalois PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) diff --git a/libdeepgalois/gpu_kernels.hpp b/libdeepgalois/gpu_kernels.hpp deleted file mode 100644 index 7cb1068fc6..0000000000 --- a/libdeepgalois/gpu_kernels.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include "cutils.h" - -// flattern data into 1D before feed into the ReLU operater -__global__ void relu_gpu(const int n, const float_t* in, float_t* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? in[index] : 0; - } -} - -__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; - } -} - -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - // Note that cublas follows fortran order. 
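[Editor's note] The comment just above ("cublas follows fortran order") and the operand swap a few lines below in the removed sgemm_gpu are the standard trick for driving a column-major BLAS from row-major buffers: a row-major M x N matrix reinterpreted as column-major is its transpose, so the call computes

    C^T = B^T * A^T   (what cuBLAS sees, column-major)

which leaves exactly C = A * B laid out row-major. That is why B is passed before A, the dimensions appear as N, M, K, and the leading dimensions lda/ldb are chosen from the transpose flags on the next lines.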
- int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, const float beta, float* y) { - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); -} - -void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); -} - diff --git a/libdeepgalois/common.h b/libdeepgalois/include/common.h similarity index 90% rename from libdeepgalois/common.h rename to libdeepgalois/include/common.h index e1bff6901d..0c3023c3f2 100644 --- a/libdeepgalois/common.h +++ b/libdeepgalois/include/common.h @@ -1,4 +1,6 @@ #pragma once +#include "types.h" +#include "utils.h" #include "cutils.h" class DeepGalois { @@ -29,9 +31,9 @@ class DeepGalois { CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); } - static void DeviceQuery(); - static bool CheckDevice(const int device_id); - static int FindDevice(const int start_id = 0); + static void DeviceQuery() {} + static bool CheckDevice(const int device_id) { return true; } + static int FindDevice(const int start_id = 0) { return 0; } protected: cublasHandle_t cublas_handle_; diff --git a/libdeepgalois/cutils.h b/libdeepgalois/include/cutils.h similarity index 91% rename from libdeepgalois/cutils.h rename to libdeepgalois/include/cutils.h index 4356ec2979..cda8d23cba 100644 --- a/libdeepgalois/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -1,4 +1,9 @@ #pragma once +#include +#include +#include +#include +#include // CUDA: use 256 threads per block const int CUDA_NUM_THREADS = 256; @@ -10,7 +15,6 @@ inline int CUDA_GET_BLOCKS(const int N) { // CUDA: various checks for different function calls. 
#define CUDA_CHECK(condition) \ - // Code block avoids redefinition of cudaError_t error \ do { \ cudaError_t error = condition; \ CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ diff --git a/libdeepgalois/layers.h b/libdeepgalois/include/layers.h similarity index 100% rename from libdeepgalois/layers.h rename to libdeepgalois/include/layers.h diff --git a/libdeepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/layers/arithmetic_layer.h similarity index 100% rename from libdeepgalois/layers/arithmetic_layer.h rename to libdeepgalois/include/layers/arithmetic_layer.h diff --git a/libdeepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h similarity index 94% rename from libdeepgalois/layers/graph_conv_layer.h rename to libdeepgalois/include/layers/graph_conv_layer.h index b81f7bc10e..2e304a0c98 100644 --- a/libdeepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -16,8 +16,9 @@ class graph_conv_layer: public layer { public: graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), dropout_(dropout) { + float dropout_rate, std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices x = input_dims[0]; y = input_dims[1]; @@ -26,7 +27,11 @@ class graph_conv_layer: public layer { name_ = layer_type() + "_" + std::to_string(level); //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; init(); + scale_ = 1. / (1. - dropout_rate_); } + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} + ~graph_conv_layer() {} void init() { std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; Timer t_alloc; @@ -50,9 +55,6 @@ class graph_conv_layer: public layer { t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } - graph_conv_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, in_dims, out_dims) {} - ~graph_conv_layer() {} std::string layer_type() const override { return std::string("graph_conv"); } // user-defined aggregate function @@ -78,12 +80,10 @@ class graph_conv_layer: public layer { //Timer t_matmul, t_agg, t_dropout; //t_matmul.Start(); if (dropout_ && phase_ == net_phase::train) { - //t_dropout.Start(); //for (size_t i = 0; i < x; ++i) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(in_data[i], dropout_mask[i], &in_temp[i*y]); + dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); }, galois::loopname("dropout")); - //t_dropout.Stop(); matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z //t_matmul.Stop(); @@ -119,7 +119,7 @@ class graph_conv_layer: public layer { update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(in_grad[i], dropout_mask[i], in_grad[i]); + d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); } } @@ -154,6 +154,8 @@ class graph_conv_layer: public layer { bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards bool dropout_; // whether to use dropout at first + const float dropout_rate_; + float scale_; net_phase phase_; size_t x; size_t y; diff --git a/libdeepgalois/layers/layer.h b/libdeepgalois/include/layers/layer.h similarity index 100% rename from libdeepgalois/layers/layer.h rename to libdeepgalois/include/layers/layer.h diff --git a/libdeepgalois/layers/linear_layer.h b/libdeepgalois/include/layers/linear_layer.h similarity index 100% rename from libdeepgalois/layers/linear_layer.h rename to libdeepgalois/include/layers/linear_layer.h diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/layers/relu_layer.h new file mode 100644 index 0000000000..c4acdd50ac --- /dev/null +++ b/libdeepgalois/include/layers/relu_layer.h @@ -0,0 +1,15 @@ +#pragma once +#include "layer.h" + +// ReLU Layer +class relu_layer : public layer { +public: + relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + ~relu_layer() {} + std::string layer_type() const override { return std::string("relu"); } + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); +}; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h new file mode 100644 index 0000000000..6375f72121 --- /dev/null +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -0,0 +1,18 @@ +#pragma once +#include "layer.h" + +class softmax_loss_layer: public layer { +public: + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims, LabelList *lab); + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : softmax_loss_layer(level, in_dims, 
out_dims, NULL) {} + ~softmax_loss_layer() {} + std::string layer_type() const override { return std::string("softmax_loss"); } + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + +private: + LabelList *labels; +}; + diff --git a/libdeepgalois/lgraph.h b/libdeepgalois/include/lgraph.h similarity index 100% rename from libdeepgalois/lgraph.h rename to libdeepgalois/include/lgraph.h diff --git a/libdeepgalois/include/math_functions.hpp b/libdeepgalois/include/math_functions.hpp new file mode 100644 index 0000000000..d3d08b10b2 --- /dev/null +++ b/libdeepgalois/include/math_functions.hpp @@ -0,0 +1,43 @@ +#ifndef _MATH_FUNCTIONS_ +#define _MATH_FUNCTIONS_ +#include +#include "types.h" +#include + +const float negative_slope = 0; + +void vadd(const vec_t &a, const vec_t &b, vec_t &out); +void vadd(size_t n, const float_t *a, const float_t *b, float_t *out); +void vsub(const vec_t &a, const vec_t &b, vec_t &out); +void vmul(const vec_t &a, const vec_t &b, vec_t &out); +void vdiv(const vec_t &a, const vec_t &b, vec_t &out); +void add_scalar(const float_t alpha, vec_t &Y); +void sub_scalar(const float_t alpha, vec_t &Y); +void mul_scalar(const float_t alpha, vec_t &Y); +void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out); +void div_scalar(const float_t alpha, vec_t &Y); +float_t dot(const vec_t &x, const vec_t &y); +void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector); +void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); +void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); +void copy2D1D(const tensor_t &in, vec_t &out); +void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); +void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); +void transpose2D(const tensor_t &in, tensor_t &out); +void transpose2D1D(const tensor_t &in, vec_t &out); +void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); +int argmax(const size_t n, const vec_t &x); +void clear(vec_t &in); +void relu(const vec_t &in, vec_t &out); +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); +void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); +void softmax(const vec_t &input, vec_t &output); +void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); +float_t cross_entropy(const vec_t &y, const vec_t &p); +void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); + +#endif diff --git a/libdeepgalois/net.h b/libdeepgalois/include/net.h similarity index 98% rename from libdeepgalois/net.h rename to libdeepgalois/include/net.h index f6d6930d5a..f845eed82e 100644 --- a/libdeepgalois/net.h +++ b/libdeepgalois/include/net.h @@ -25,7 +25,6 @@ class Net { virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} void init() { - assert(dropout_rate < 1.0); read_graph(dataset, g); n = 
g.size(); // N labels.resize(n, 0); // label for each vertex: N x 1 @@ -79,13 +78,14 @@ class Net { layers[i]->print_layer_info(); } - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true) { + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, float dropout_rate = 0.5) { + assert(dropout_rate < 1.0); assert(layer_id < NUM_CONV_LAYERS); std::vector in_dims(2), out_dims(2); in_dims[0] = out_dims[0] = n; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, in_dims, out_dims); + layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } diff --git a/libdeepgalois/node.h b/libdeepgalois/include/node.h similarity index 99% rename from libdeepgalois/node.h rename to libdeepgalois/include/node.h index deffebad9b..1a50080934 100644 --- a/libdeepgalois/node.h +++ b/libdeepgalois/include/node.h @@ -1,5 +1,6 @@ #pragma once #include +#include "types.h" class node; class layer; class edge; diff --git a/libdeepgalois/optimizer.h b/libdeepgalois/include/optimizer.h similarity index 97% rename from libdeepgalois/optimizer.h rename to libdeepgalois/include/optimizer.h index 2896881fed..d0f35eac11 100644 --- a/libdeepgalois/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -39,7 +39,7 @@ struct stateful_optimizer : public optimizer { * The Journal of Machine Learning Research, pages 2121-2159, 2011. **/ struct adagrad : public stateful_optimizer<1> { - adagrad() : alpha(learning_rate), eps(float_t(1e-8)) {} + adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t &dW, vec_t &W, bool parallelize) { vec_t &g = get<0>(W); if (parallelize) { @@ -83,7 +83,7 @@ struct RMSprop : public stateful_optimizer<1> { // Adam: A Method for Stochastic Optimization // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { - adam() : alpha(learning_rate), b1(float_t(0.9)), + adam() : alpha(0.01), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} @@ -96,7 +96,7 @@ struct adam : public stateful_optimizer<2> { // L2 norm based update rule W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size(), galois::steal(), galois::loopname("adam_update")); + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } diff --git a/libdeepgalois/random.h b/libdeepgalois/include/random.h similarity index 100% rename from libdeepgalois/random.h rename to libdeepgalois/include/random.h diff --git a/libdeepgalois/timer.h b/libdeepgalois/include/timer.h similarity index 100% rename from libdeepgalois/timer.h rename to libdeepgalois/include/timer.h diff --git a/libdeepgalois/types.h b/libdeepgalois/include/types.h similarity index 98% rename from libdeepgalois/types.h rename to libdeepgalois/include/types.h index bc9fe21049..0aa80cce4f 100644 --- a/libdeepgalois/types.h +++ b/libdeepgalois/include/types.h @@ -30,5 +30,6 @@ typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::typ #endif typedef Graph::GraphNode GNode; +#define CHUNK_SIZE 256 #endif diff --git a/libdeepgalois/utils.h b/libdeepgalois/include/utils.h similarity index 92% rename from libdeepgalois/utils.h rename to 
libdeepgalois/include/utils.h index 100a997b57..ceb49b0e41 100644 --- a/libdeepgalois/utils.h +++ b/libdeepgalois/include/utils.h @@ -3,10 +3,11 @@ #include #include #include +#include #include #include -std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset +const std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset enum class net_phase { train, test }; class ResourceManager { @@ -85,7 +86,7 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t{0}, float_t{1}) <= p; } -size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, MaskList &masks) { +inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, std::vector &masks) { if (dataset_str != "citeseer" && dataset_str != "cora") { std::cout << "Dataset currently not supported\n"; exit(1); diff --git a/libdeepgalois/layers/relu_layer.h b/libdeepgalois/layers/relu_layer.h deleted file mode 100644 index 2795fc404e..0000000000 --- a/libdeepgalois/layers/relu_layer.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once -#include "layer.h" - -// ReLU Layer -class relu_layer : public layer { -public: - relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("relu"); } - // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); - } - // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) - // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); - } -}; diff --git a/libdeepgalois/layers/softmax_loss_layer.h b/libdeepgalois/layers/softmax_loss_layer.h deleted file mode 100644 index bdd52e4d38..0000000000 --- a/libdeepgalois/layers/softmax_loss_layer.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once -#include "layer.h" - -class softmax_loss_layer: public layer { -public: - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims, LabelList *lab) - : layer(level, in_dims, out_dims), labels(lab) { - trainable_ = false; - loss.resize(in_dims[0]); // error for each sample - name_ = layer_type() + "_" + std::to_string(level); - } - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : - softmax_loss_layer(level, in_dims, out_dims, NULL) {} - ~softmax_loss_layer() {} - std::string layer_type() const override { return std::string("softmax_loss"); } - - // TODO: need kernel fusion optimization - // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(in_data[i], out_data[i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; // one-hot - loss[i] = cross_entropy(y, out_data[i]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); - } - - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(output_dims[1]); - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; - d_cross_entropy(y, out_data[i], norm_grad); - d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); - } - -private: - LabelList *labels; -}; - diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp new file mode 100644 index 0000000000..ccabc8a090 --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -0,0 +1,19 @@ +#include "layers/relu_layer.h" + +// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) +void relu_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + out_data[i][j] = std::max(in_data[i][j], (float_t)0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); +} diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp new file mode 100644 index 0000000000..61f63f6f0e --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -0,0 +1,34 @@ +#include "layers/softmax_loss_layer.h" + +softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims, LabelList *lab) + : layer(level, in_dims, out_dims), labels(lab) { + trainable_ = false; + loss.resize(in_dims[0]); // error for each sample + name_ = layer_type() + "_" + std::to_string(level); +} + +// TODO: need kernel fusion optimization +// ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] +void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(in_data[i], out_data[i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; // one-hot + loss[i] = cross_entropy(y, out_data[i]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); +} + +void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { + //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + vec_t norm_grad(output_dims[1]); + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; + d_cross_entropy(y, out_data[i], norm_grad); + d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); +} + diff --git a/libdeepgalois/math_functions.hpp b/libdeepgalois/src/math_functions.cpp similarity index 60% rename from libdeepgalois/math_functions.hpp rename to libdeepgalois/src/math_functions.cpp index f1612aac1c..a4d1d77719 100644 --- a/libdeepgalois/math_functions.hpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,21 +1,13 @@ -#ifndef _MATH_FUNCTIONS_ -#define _MATH_FUNCTIONS_ -#include +#include "math_functions.hpp" #include "utils.h" -#include -#ifdef WITH_BLAS extern "C" { #include //#include } -#endif - -const float negative_slope = 0; // vector add -template -inline void vadd(const std::vector &a, const std::vector &b, std::vector &out) { +void vadd(const vec_t &a, const vec_t &b, vec_t &out) { //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; size_t n = out.size(); size_t vec_len = 8; @@ -25,8 +17,7 @@ inline void vadd(const std::vector &a, const std::vector &b, std for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } -template -inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { +void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { size_t vec_len = 8; const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) @@ -35,20 +26,17 @@ inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { } // vector subtract -template -inline void vsub(const std::vector &in_a, const std::vector &in_b, std::vector &out) { +void vsub(const vec_t &in_a, const vec_t &in_b, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; } // vector multiply -template -inline 
void vmul(const std::vector &in_a, const std::vector &in_b, std::vector &out) { +void vmul(const vec_t &in_a, const vec_t &in_b, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; } // vector divide -template -inline void vdiv(const std::vector &in_a, const std::vector &in_b, std::vector &out) { +void vdiv(const vec_t &in_a, const vec_t &in_b, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) { assert(in_b[i] != 0); out[i] = in_a[i] / in_b[i]; @@ -56,46 +44,40 @@ inline void vdiv(const std::vector &in_a, const std::vector &in_ } // vector add scalar -template -inline void add_scalar(const DataTy alpha, std::vector &Y) { +void add_scalar(const float_t alpha, vec_t &Y) { for (size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; } // vector subtract scalar -template -inline void sub_scalar(const DataTy alpha, std::vector &Y) { +void sub_scalar(const float_t alpha, vec_t &Y) { for (size_t i = 0; i < Y.size(); ++i) Y[i] -= alpha; } // vector multiply scalar -template -inline void mul_scalar(const DataTy alpha, std::vector &Y) { +void mul_scalar(const float_t alpha, vec_t &Y) { for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; } -template -inline void mul_scalar(size_t n, const DataTy alpha, const DataTy *in, DataTy *out) { +void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out) { for (size_t i = 0; i < n; ++i) out[i] = alpha *in[i]; } // vector divide scalar -template -inline void div_scalar(const DataTy alpha, std::vector &Y) { +void div_scalar(const float_t alpha, vec_t &Y) { assert(alpha != 0); for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; } // dot product -template -inline DataTy dot(const std::vector &x, const std::vector &y) { - DataTy sum = 0; +float_t dot(const vec_t &x, const vec_t &y) { + float_t sum = 0; for (size_t i = 0; i < x.size(); ++i) sum += x[i] * y[i]; return sum; } // matrix-vector multiply -inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { +void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { size_t m = out_vector.size(); size_t n = in_vector.size(); for (size_t i = 0; i < m; ++i) { @@ -106,7 +88,7 @@ inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector } // vector-vector multiply -inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { +void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { size_t m = a.size(); size_t n = b.size(); for (size_t i = 0; i < m; ++i) { @@ -117,35 +99,32 @@ inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { } // matrix addition -inline void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { +void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { for (size_t i = 0; i < x; ++i) for (size_t j = 0; j < y; ++j) C[i][j] = A[i][j] + B[i][j]; } // TODO: vectorize -template -inline void copy2D1D(const tensor_t &in, vec_t &out) { +void copy2D1D(const tensor_t &in, vec_t &out) { size_t x = in.size(); size_t y = in[0].size(); -#ifdef WITH_BLAS auto ptr = &out[0]; for (size_t i = 0; i < x; i++) { std::copy(in[i].begin(), in[i].end(), ptr); ptr += y; } -#else - assert(out.size() == x*y); - for (size_t i = 0; i < x; i ++) { - for (size_t j = 0; j < y; j ++) { - out[i*y+j] = in[i][j]; - } - } -#endif } -// matrix multiply: all 2D -inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, 
const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { // A: x*z; B: z*y; C: x*y size_t dim_x = A.size(); size_t dim_y = C[0].size(); @@ -164,72 +143,37 @@ inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { } } -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); -} - -inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C) { galois::StatTimer Tmatmul("MatMul"); Tmatmul.start(); assert(A.size() == dim_x*dim_z); assert(B.size() == dim_z*dim_y); assert(C.size() == dim_x*dim_y); - -#ifdef WITH_BLAS const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, &A[0], &B[0], 0.0, &C[0]); -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i*dim_z+k] * B[k*dim_y+j]; - } - } - } -#endif Tmatmul.stop(); } -inline void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { +void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { // A: x*z; B: z*y; C: x*y size_t dim_x = A.size(); size_t dim_z = A[0].size(); assert(B.size() == dim_z*dim_y); assert(C.size() == dim_x*dim_y); - -#ifdef WITH_BLAS vec_t A1D(dim_x*dim_z); copy2D1D(A, A1D); matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i][k] * B[k][j]; - } - } - } -#endif } -// matrix multiply -inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { +void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { // A: x*z; B: z*y; C: x*y size_t dim_x = C.size(); size_t dim_y = C[0].size(); size_t dim_z = A[0].size(); assert(A.size() == dim_x); assert(B.size() == dim_y*dim_z); - -#ifdef WITH_BLAS vec_t A1D(dim_x*dim_z); vec_t C1D(dim_x*dim_y, 0); auto ptr = &A1D[0]; @@ -243,20 +187,9 @@ inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { C[i][j] = C1D[i*dim_y+j]; } } -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k*dim_y+j]; - } - } - } -#endif } -template -inline void transpose2D(const tensor_t &in, tensor_t &out) { +void transpose2D(const tensor_t &in, tensor_t &out) { size_t x = in.size(); size_t y = in[0].size(); for (size_t i = 0; i < y; i ++) { @@ -267,8 +200,7 @@ inline void transpose2D(const tensor_t &in, tensor_t &out) { } // TODO: vectorize -template -inline void transpose2D1D(const tensor_t &in, vec_t &out) { +void transpose2D1D(const tensor_t &in, vec_t &out) { size_t x = in.size(); size_t y = in[0].size(); assert(out.size() == x*y); @@ 
-279,18 +211,15 @@ inline void transpose2D1D(const tensor_t &in, vec_t &out) { } } -template -inline void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { +void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { for (size_t i = 0; i < y; i ++) { for (size_t j = 0; j < x; j ++) { out[i*x+j] = in[j*y+i]; } } } - -template -inline int argmax(const size_t n, const std::vector &x) { - DataTy max = x[0]; +int argmax(const size_t n, const vec_t &x) { + float_t max = x[0]; int max_ind = 0; for (size_t i = 1; i < n; i++) { if (x[i] > max) { @@ -301,72 +230,32 @@ inline int argmax(const size_t n, const std::vector &x) { return max_ind; } -inline void clear(vec_t &in) { +void clear(vec_t &in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; } -inline void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -inline void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -template -inline void relu(const std::vector &in, std::vector &out) { +void relu(const vec_t &in, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (DataTy)0) + negative_slope * std::min(in[i], (DataTy)0); + out[i] = std::max(in[i], (float_t)0) + negative_slope * std::min(in[i], (float_t)0); } } -template -inline void d_relu(const std::vector &in_diff, const std::vector &fv, std::vector &out_diff) { +void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (DataTy)0) + negative_slope * (fv[i] <= (DataTy)0)); + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); } } -inline void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { +void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff } -inline void d_vadd(vec_t &in_diff, vec_t &out_diff) { +void d_vadd(vec_t &in_diff, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) out_diff[i] = in_diff[i]; } -template -inline float reduce_mean(const std::vector &x) { +float reduce_mean(const vec_t &x) { size_t n = x.size(); assert(n > 0); float sum = (float)x[0]; @@ -376,51 
+265,83 @@ inline float reduce_mean(const std::vector &x) { return sum / (float)n; } -const float scale_ = 1. / (1. - dropout_rate); - -inline void dropout(const vec_t &in, std::vector &mask, vec_t &out) { +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out) { assert(mask.size() == out.size()); //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers for (size_t i = 0; i < in.size(); ++i) mask[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; + out[i] = in[i] * mask[i] * scale; } -inline void dropout(const vec_t &in, std::vector &mask, float_t *out) { +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out) { for (size_t i = 0; i < in.size(); ++i) mask[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; + out[i] = in[i] * mask[i] * scale; } -inline void d_dropout(const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { +void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale_; + out_diff[i] = in_diff[i] * mask[i] * scale; } -template -inline DataTy sigmoid_func(DataTy x) { +float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } // Sigmoid -template -inline void sigmoid(std::vector &fv) { +void sigmoid(vec_t &fv) { size_t count = fv.size(); for (size_t i = 0; i < count; ++i) { fv[i] = sigmoid_func(fv[i]); } } +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); // TODO: vectorize clear + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor = in[dst]; + mul_scalar(b, neighbor); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + size_t len = out[0].size(); + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst*len], neighbor.data()); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + // Softmax function takes an N-dimensional vector (X) of real number, // and transforms it into a vector of real number in range (0,1) which add upto 1. 
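For reference, the softmax/cross-entropy pair implemented below can be summarized by the following minimal, self-contained sketch. It uses plain std::vector<float> instead of the vec_t/float_t typedefs from this patch (an assumption made only for illustration) and already applies the max-subtraction trick that the next comments describe:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax: shifting every logit by max(x) leaves the result
// unchanged, since exp(x_i - m) / sum_j exp(x_j - m) == exp(x_i) / sum_j exp(x_j),
// but keeps exp() from overflowing for large inputs.
std::vector<float> softmax_ref(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  std::vector<float> p(x.size());
  float denom = 0.0f;
  for (std::size_t i = 0; i < x.size(); ++i) { p[i] = std::exp(x[i] - m); denom += p[i]; }
  for (std::size_t i = 0; i < x.size(); ++i) p[i] /= denom;
  return p;
}

// Cross-entropy against a one-hot label vector y; clamping p[i] at 1e-10 mirrors
// the log(1e-10) guard used in cross_entropy() below.
float cross_entropy_ref(const std::vector<float>& y, const std::vector<float>& p) {
  assert(y.size() == p.size());
  float loss = 0.0f;
  for (std::size_t i = 0; i < y.size(); ++i)
    if (y[i] != 0.0f) loss -= y[i] * std::log(std::max(p[i], 1e-10f));
  return loss;
}

When the two are composed, the gradient of the loss with respect to the logits collapses to p - y for one-hot y, which is one reason the kernel-fusion TODO in softmax_loss_layer is attractive: a fused backward pass can skip the per-sample Jacobian built in d_softmax.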
// To make softmax func numerically stable, we simply normalize the values in the vector, // by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) // exps = np.exp(X - np.max(X)) // exps / np.sum(exps) -template -inline void softmax(const std::vector &input, std::vector &output) { +void softmax(const vec_t &input, vec_t &output) { const float_t max = *std::max_element(input.begin(), input.end()); float_t denominator(0); for (size_t i = 0; i < input.size(); i++) { @@ -431,8 +352,7 @@ inline void softmax(const std::vector &input, std::vector &outpu output[i] /= denominator; } -template -inline void log_softmax(const std::vector &input, std::vector &output) { +void log_softmax(const vec_t &input, vec_t &output) { const float_t max = *std::max_element(input.begin(), input.end()); float_t denominator(0); for (size_t i = 0; i < input.size(); i++) @@ -445,38 +365,27 @@ inline void log_softmax(const std::vector &input, std::vector &o // we often use it as the final layer in neural networks. // For this we need to calculate the derivative or gradient, // and pass it back to the previous layer during backpropagation. -template -inline void d_softmax(const std::vector &y, const std::vector &p, - std::vector &dy, const std::vector &dp) { +void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp) { auto n = y.size(); vec_t df(n, 0); for (size_t i = 0; i < n; i++) { for (size_t j = 0; j < n; j++) { - //DataTy delta_ij = i == j? 1 : 0; + //float_t delta_ij = i == j? 1 : 0; //df[i] += p[j] * (delta_ij - p[i]); df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; } // dy = dp * (gradient of softmax) dy[i] = dot(dp, df); } -/* - for (size_t j = 0; j < x.size(); j++) { - for (size_t k = 0; k < x.size(); k++) { - df[k] = (k == j) ? y[j] * (float_t(1) - y[j]) : -y[k] * y[j]; - } - dx[j] = vectorize::dot(&dy[0], &df[0], len); - } -*/ } // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability -template -inline DataTy cross_entropy(const std::vector &y, const std::vector &p) { +float_t cross_entropy(const vec_t &y, const vec_t &p) { auto n = y.size(); assert(n > 0); - DataTy loss = 0.0; + float_t loss = 0.0; for (size_t i = 0; i < n; i++) { if (y[i] == float_t(0)) continue; if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); @@ -487,8 +396,7 @@ inline DataTy cross_entropy(const std::vector &y, const std::vector -inline void d_cross_entropy(const std::vector &y, const std::vector &p, std::vector &d) { +void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { auto n = y.size(); //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); for (size_t i = 0; i < n; i++) { @@ -497,4 +405,3 @@ inline void d_cross_entropy(const std::vector &y, const std::vector 0 ? in[index] : 0; + } +} + +__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; + } +} + +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, float* y) { + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} + +void scal_gpu(const int N, const float alpha, float *X) { + CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); +} + +void dot_gpu(const int n, const float* x, const float* y, float* out) { + CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); +} + +void asum_gpu(const int n, const float* x, float* y) { + CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); +} + +void scale_gpu(const int n, const float alpha, const float *x, float* y) { + CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); +} + +__global__ void set_kernel(const int n, const float_t alpha, float_t* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] = alpha; + } +} + +void set_gpu(const int N, const float_t alpha, float_t* Y) { + if (alpha == 0) { + CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); + return; + } + set_kernel<<>>(N, alpha, Y); +} + +__global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] += alpha; + } +} + +void add_scalar_gpu(const int N, const float alpha, float* Y) { + add_scalar_kernel<<>>(N, alpha, Y); +} + +__global__ void add_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] = a[index] + b[index]; + } +} + +void add_gpu(const int N, const float* a, const float* b, float* y) { + add_kernel<<>>(N, a, b, y); +} + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index c03a5c6676..9e2597dffb 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -3,6 +3,6 @@ include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) -include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois) +include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) add_subdirectory(gcn) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index f1a65740f7..05484252b8 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,16 +1,8 @@ -SET(USE_BLAS ON CACHE BOOL "Use blas") - -SET(BLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(BLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) - -if (USE_BLAS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_BLAS") - include_directories(${BLAS_INC}) - link_directories(${BLAS_LIB}) -endif() +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) app(gcn gcn.cpp) +target_link_libraries(gcn deepgalois) -if (USE_BLAS) - target_link_libraries(gcn -lopenblas) -endif() diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index efbb862fd7..cbf3c1ae2a 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -24,7 +24,6 @@ static cll::opt early_stopping("es", cll::desc("Tolerance for early stopp static cll::opt max_degree("md", 
cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -#define CHUNK_SIZE 256 //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 55f96412a8bdebddf572625a22d4e5f6c305c13f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 21 Feb 2020 09:51:17 -0600 Subject: [PATCH 004/660] add cuda in cmake --- libdeepgalois/CMakeLists.txt | 37 ++++++---- libdeepgalois/include/aggregator.h | 6 ++ libdeepgalois/include/common.h | 26 +++++-- libdeepgalois/include/cutils.h | 14 ++-- libdeepgalois/include/gtypes.h | 15 ++++ .../include/layers/graph_conv_layer.h | 72 +++---------------- libdeepgalois/include/layers/layer.h | 10 +-- libdeepgalois/include/layers/relu_layer.h | 2 + .../include/layers/softmax_loss_layer.h | 2 + .../{math_functions.hpp => math_functions.hh} | 27 ++++--- libdeepgalois/include/net.h | 8 +-- libdeepgalois/include/node.h | 55 +++----------- libdeepgalois/include/types.h | 13 +--- libdeepgalois/include/utils.h | 1 + libdeepgalois/src/aggregator.cpp | 40 +++++++++++ libdeepgalois/src/aggregator.cu | 7 ++ libdeepgalois/src/layers/graph_conv_layer.cpp | 54 ++++++++++++++ libdeepgalois/src/layers/relu_layer.cpp | 16 ++++- .../src/layers/softmax_loss_layer.cpp | 3 + libdeepgalois/src/math_functions.cpp | 43 ++--------- libdeepgalois/src/math_functions.cu | 60 +++++++++++----- 21 files changed, 285 insertions(+), 226 deletions(-) create mode 100644 libdeepgalois/include/aggregator.h create mode 100644 libdeepgalois/include/gtypes.h rename libdeepgalois/include/{math_functions.hpp => math_functions.hh} (58%) create mode 100644 libdeepgalois/src/aggregator.cpp create mode 100644 libdeepgalois/src/aggregator.cu create mode 100644 libdeepgalois/src/layers/graph_conv_layer.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 4f51532898..7e558221f6 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -3,23 +3,15 @@ SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) -#SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) -#SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) -#SET(ENABLE_GPU OFF CACHE BOOL "Use GPU for DeepGalois") -#if (ENABLE_GPU) -# target_compile_definitions(distbench PRIVATE __GALOIS_HET_CUDA__=1) -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_GPU") -# include_directories(${CUDA_INC}) -# link_directories(${CUDA_LIB}) -#endif() +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) +include_directories(${CUDA_INC}) +link_directories(${CUDA_LIB}) +link_directories(${CMAKE_SOURCE_DIR}/libgpu) #set(sources # src/layers/relu_layer.cu #) -#cuda_add_library(deepgalois_gpu -# ${sources} -#OPTIONS -D_FORCE_INLINES -#) #target_include_directories(deepgalois_gpu PUBLIC # ${CMAKE_SOURCE_DIR}/libgpu/include #) @@ -27,18 +19,33 @@ link_directories(${OPENBLAS_LIB}) # INTERFACE_POSITION_INDEPENDENT_CODE On # POSITION_INDEPENDENT_CODE On #) -#target_link_libraries(deepgalois -lcudart -lcublas) +cmake_minimum_required(VERSION 2.8) +find_package(CUDA REQUIRED) +set(CUDA_SEPARABLE_COMPILATION ON) +set(CUDA_PROPAGATE_HOST_FLAGS OFF) +#set(CUDA_HOST_COMPILER g++) +list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +file(GLOB CUDA_FILES "src/" *.cu) +#CUDA_COMPILE(CU_O src/math_functions.cu) +CUDA_COMPILE(CU_O ${CUDA_FILES}) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources src/layers/relu_layer.cpp + src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/math_functions.cpp + src/aggregator.cpp + ${CU_O} ) add_library(deepgalois STATIC ${sources}) -target_link_libraries(deepgalois galois_shmem gllvm) +target_link_libraries(deepgalois galois_shmem gllvm galois_gpu) target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) target_link_libraries(deepgalois -lopenblas) +target_link_libraries(deepgalois -lcudart -lcublas) target_include_directories(deepgalois PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h new file mode 100644 index 0000000000..4e178d89b8 --- /dev/null +++ b/libdeepgalois/include/aggregator.h @@ -0,0 +1,6 @@ +#pragma once +#include "types.h" +#include "gtypes.h" + +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); diff --git a/libdeepgalois/include/common.h b/libdeepgalois/include/common.h index 0c3023c3f2..f942fd106c 100644 --- a/libdeepgalois/include/common.h +++ b/libdeepgalois/include/common.h @@ -2,6 +2,7 @@ #include "types.h" #include "utils.h" #include "cutils.h" +//#include "random.h" class DeepGalois { public: @@ -28,17 +29,17 @@ class DeepGalois { if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); + //CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + //CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); } static void DeviceQuery() {} static bool CheckDevice(const int device_id) { return true; } static int FindDevice(const int start_id = 0) { return 0; } protected: - cublasHandle_t cublas_handle_; - curandGenerator_t curand_generator_; - shared_ptr random_generator_; + cublasHandle_t cublas_handle_; // used to call cuBLAS + curandGenerator_t curand_generator_; // used to generate random numbers on GPU + //shared_ptr random_generator_; Brew mode_; // Parallel training int solver_count_; @@ -47,6 +48,19 @@ class DeepGalois { private: // The private constructor to avoid duplicate instantiation. - DeepGalois(); + DeepGalois() : cublas_handle_(NULL), curand_generator_(NULL), + //random_generator_(NULL), mode_(DeepGalois::CPU), + mode_(DeepGalois::CPU), + solver_count_(1), solver_rank_(0), multiprocess_(false) { + // Try to create a cublas handler, and report an error if failed (but we will + // keep the program running as one might just want to run CPU code). + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + std::cout << "Cannot create Cublas handle. Cublas won't be available."; + } + // Try to create a curand handler. 
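+  // (As with the cuBLAS handle above, curandCreateGenerator and
+  //  curandSetPseudoRandomGeneratorSeed return a curandStatus_t to compare against
+  //  CURAND_STATUS_SUCCESS; a failure here only means the GPU RNG is unavailable,
+  //  so CPU-only runs can still proceed.)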
+ //if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || + // curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) + // std::cout << "Cannot create Curand generator. Curand won't be available."; + } }; diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index cda8d23cba..8a0fcaa3a1 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -17,21 +17,23 @@ inline int CUDA_GET_BLOCKS(const int N) { #define CUDA_CHECK(condition) \ do { \ cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ + error, __FILE__, __LINE__, cudaGetErrorString(error) ); \ + exit(EXIT_FAILURE); \ + } \ } while (0) #define CUBLAS_CHECK(condition) \ do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << caffe::cublasGetErrorString(status); \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) \ + ; \ } while (0) #define CURAND_CHECK(condition) \ do { \ curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << caffe::curandGetErrorString(status); \ } while (0) // CUDA: grid stride looping diff --git a/libdeepgalois/include/gtypes.h b/libdeepgalois/include/gtypes.h new file mode 100644 index 0000000000..a30468b0f9 --- /dev/null +++ b/libdeepgalois/include/gtypes.h @@ -0,0 +1,15 @@ +#pragma once +#include "galois/Galois.h" +#include "galois/graphs/LCGraph.h" + +typedef galois::GAccumulator AccumF; +typedef galois::GAccumulator AccumU; + +#ifdef EDGE_LABEL +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#else +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#endif + +typedef Graph::GraphNode GNode; + diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 2e304a0c98..0633ec63e9 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -1,5 +1,6 @@ #pragma once #include "layer.h" +#include "gtypes.h" /* GraphConv Layer Parameters @@ -56,12 +57,15 @@ class graph_conv_layer: public layer { std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } std::string layer_type() const override { return std::string("graph_conv"); } - + void set_context(net_phase ctx) override { phase_ = ctx; } + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function - void aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } - + virtual void aggregate(Graph *g, const vec_t &in, tensor_t &out); // user-defined combine function - void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { + virtual void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { vec_t a(out.size(), 0); vec_t b(out.size(), 0); 
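    // combine() keeps the two terms explicit: a = mat_v * self (the vertex's own
    // embedding) and b = mat_u * neighbors (the aggregated neighbor embedding),
    // then out = a + b.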
mvmul(mat_v, self, a); @@ -69,66 +73,6 @@ class graph_conv_layer: public layer { vadd(a, b, out); // out = W*self + Q*neighbors } - void set_context(net_phase ctx) override { phase_ = ctx; } - - // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - // input: x*y; W: y*z; output: x*z - // if y > z: - // mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - //Timer t_matmul, t_agg, t_dropout; - //t_matmul.Start(); - if (dropout_ && phase_ == net_phase::train) { - //for (size_t i = 0; i < x; ++i) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); - }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - //t_matmul.Stop(); - //t_agg.Start(); - aggregate(graph, out_temp, out_data); // aggregate - //t_agg.Stop(); - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(out_data[i], out_data[i]); - }, galois::loopname("relu")); - } - //double dropout_time = 0; - //if (dropout_ && phase_ == net_phase::train) dropout_time = t_dropout.Millisecs(); - //std::cout << "\n\t" << name_ << " matmul time: " << t_matmul.Millisecs() - // << ", aggregation time: " << t_agg.Millisecs() << ", dropout time: " << dropout_time << "\n"; - } - - // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - if (act_) { - //for (size_t j = 0; j < z; ++j) - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - for (size_t j = 0; j < z; ++j) - //if (out_data[i][j] <= 0.0) out_temp[i][j] = 0.0; - out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::loopname("d_relu")); - //} else out_temp = out_grad; // TODO: avoid copying - } else copy2D1D(out_grad, out_temp); - if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y - update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); - }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); - } - } - - // calculate weight gradients - transpose2D1D(in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z - } - void degree_counting() { assert(x == graph->size()); degrees.resize(x); diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 4a8a545738..076253fe82 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -15,8 +16,9 @@ #include "../node.h" #include "../types.h" #include "../utils.h" +#include "../gtypes.h" #include "../optimizer.h" -#include "../math_functions.hpp" +#include "../math_functions.hh" /** * base class of all kind of NN layers * @@ -36,11 +38,11 @@ class layer : public node { input_dims(in_dims), output_dims(out_dims) { add_edge(); } virtual ~layer() = default; virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) = 0; + virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) = 0; + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; virtual std::string layer_type() const = 0; virtual void set_context(net_phase ctx) {} - //virtual void setup(Graph *g, vec_t *diff, LabelList *lab) = 0; void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/layers/relu_layer.h index c4acdd50ac..285e09b472 100644 --- a/libdeepgalois/include/layers/relu_layer.h +++ b/libdeepgalois/include/layers/relu_layer.h @@ -11,5 +11,7 @@ class relu_layer : public layer { ~relu_layer() {} std::string layer_type() const override { return std::string("relu"); } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 6375f72121..236fd35118 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -10,7 +10,9 @@ class 
softmax_loss_layer: public layer { ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); private: LabelList *labels; diff --git a/libdeepgalois/include/math_functions.hpp b/libdeepgalois/include/math_functions.hh similarity index 58% rename from libdeepgalois/include/math_functions.hpp rename to libdeepgalois/include/math_functions.hh index d3d08b10b2..86363f4ba3 100644 --- a/libdeepgalois/include/math_functions.hpp +++ b/libdeepgalois/include/math_functions.hh @@ -1,12 +1,14 @@ #ifndef _MATH_FUNCTIONS_ #define _MATH_FUNCTIONS_ #include +#include +#include +#include #include "types.h" -#include const float negative_slope = 0; -void vadd(const vec_t &a, const vec_t &b, vec_t &out); +void vadd(const vec_t &a, const vec_t &b, vec_t &out); // vector add void vadd(size_t n, const float_t *a, const float_t *b, float_t *out); void vsub(const vec_t &a, const vec_t &b, vec_t &out); void vmul(const vec_t &a, const vec_t &b, vec_t &out); @@ -22,22 +24,29 @@ void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); void copy2D1D(const tensor_t &in, vec_t &out); void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); void transpose2D(const tensor_t &in, tensor_t &out); void transpose2D1D(const tensor_t &in, vec_t &out); void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); -int argmax(const size_t n, const vec_t &x); +int argmax(const size_t n, const vec_t &x); // the arguments of the maxima void clear(vec_t &in); -void relu(const vec_t &in, vec_t &out); -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); +void relu(const vec_t &in, vec_t &out); // ReLU +void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); -void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); +void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); // dropout derivative void softmax(const vec_t &input, vec_t &output); void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); float_t cross_entropy(const vec_t &y, const vec_t &p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); +void vadd_gpu(const size_t n, const float_t *a, const float_t 
*b, float_t *out); // vector add +void relu_gpu(const size_t n, const float_t *in, float_t *out); // ReLU +void d_relu_gpu(const size_t n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative +void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout +void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply +int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima + #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index f845eed82e..8b51b6d350 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,7 +2,7 @@ #define _MODEL_H_ #include -#include "galois/Galois.h" +#include "gtypes.h" #include "galois/Timer.h" #include "lgraph.h" #include "layers.h" @@ -18,12 +18,6 @@ class Net { public: Net() {} - // user-defined aggregate function - virtual void aggregate(Graph *g, size_t dim, const tensor_t &in_feats, tensor_t &out_feats) {} - - // user-defined combine function - virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} - void init() { read_graph(dataset, g); n = g.size(); // N diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 1a50080934..eec041e0e1 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -13,17 +13,11 @@ class node : public std::enable_shared_from_this { node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} virtual ~node() {} const edgeptr_t prev() const { return prev_; } - //const std::vector &prev() const { return prev_; } const edgeptr_t next() const { return next_; } - //const std::vector &next() const { return next_; } - //std::vector prev_nodes() const; - //std::vector next_nodes() const; protected: node() = delete; friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); - //mutable std::vector prev_; - //mutable std::vector next_; mutable edgeptr_t prev_; mutable edgeptr_t next_; }; @@ -46,8 +40,7 @@ class edge { std::copy(grad_head.begin(), grad_head.end(), pdst); // @todo consider adding parallelism and vectorization for (size_t sample = 1; sample < grad_.size(); ++sample) { - for (size_t i = 0; i < sz; i++) - pdst[i] += grad_[sample][i]; + for (size_t i = 0; i < sz; i++) pdst[i] += grad_[sample][i]; //vectorize::reduce(&grad_[sample][0], sz, pdst); } } @@ -61,50 +54,24 @@ class edge { tensor_t *get_data_ptr() { return &data_; } tensor_t &get_data() { return data_; } - //const tensor_t *get_data() const { return &data_; } const tensor_t &get_data() const { return data_; } - //tensor_t *get_gradient() { return &grad_; } tensor_t &get_gradient() { return grad_; } - //const tensor_t *get_gradient() const { return &grad_; } const tensor_t &get_gradient() const { return grad_; } + float_t *get_gpu_data() const { return gpu_data_; } + float_t *get_gpu_gradient() { return gpu_grad_; } - //const std::vector &next() const { return next_; } const node *next() const { return next_; } node *prev() { return prev_; } const node *prev() const { return prev_; } - //const shape3d &shape() const { return shape_; } - //vector_type vtype() const { return vtype_; } - //void add_next_node(node *next) { next_.push_back(next); } void add_next_node(node 
*next) { next_ = next; } + private: - //shape3d shape_; - size_t ft_dim_; - //vector_type vtype_; - tensor_t data_; - tensor_t grad_; - node *prev_; // previous node, "producer" of this tensor - node *next_; // next node, "consumer" of this tensor - //std::vector next_; // next nodes, "consumers" of this tensor + size_t ft_dim_; // feature dimensions + tensor_t data_; // feature vectors on CPU + tensor_t grad_; // gradients on CPU + float_t *gpu_data_; // feature vectors on GPU + float_t *gpu_grad_; // gradients on CPU + node *prev_; // previous node, "producer" of this tensor + node *next_; // next node, "consumer" of this tensor }; -/* -inline std::vector node::prev_nodes() const { - std::vector vecs; - for (auto &e : prev_) { - if (e && e->prev()) { - vecs.insert(vecs.end(), e->prev()); - } - } - return vecs; -} -inline std::vector node::next_nodes() const { - std::vector vecs; - for (auto &e : next_) { - if (e) { - auto n = e->next(); - vecs.insert(vecs.end(), n.begin(), n.end()); - } - } - return vecs; -} -*/ diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 0aa80cce4f..8d78e03d48 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -1,8 +1,7 @@ #ifndef TYPES_H #define TYPES_H #include -#include "galois/Galois.h" -#include "galois/graphs/LCGraph.h" +#include #ifdef CNN_USE_DOUBLE typedef double float_t; @@ -20,16 +19,6 @@ typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test typedef std::vector LabelList; // label list to store label for each vertex typedef std::vector MaskList; // mask list to store mask for each vertex -typedef galois::GAccumulator AccumF; -typedef galois::GAccumulator AccumU; - -#ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#else -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#endif - -typedef Graph::GraphNode GNode; #define CHUNK_SIZE 256 #endif diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index ceb49b0e41..caf27c56a3 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp new file mode 100644 index 0000000000..4b3f7cbab6 --- /dev/null +++ b/libdeepgalois/src/aggregator.cpp @@ -0,0 +1,40 @@ +#include "aggregator.h" +#include "math_functions.hh" + +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); // TODO: vectorize clear + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor = in[dst]; + mul_scalar(b, neighbor); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + size_t len = out[0].size(); + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); + 
float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst*len], neighbor.data()); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu new file mode 100644 index 0000000000..1cc93e6866 --- /dev/null +++ b/libdeepgalois/src/aggregator.cu @@ -0,0 +1,7 @@ +#include "csr_graph.h" +#include "aggregator.h" +#include "math_functions.hh" + +void update_all(CSRGraph *g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +} + diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp new file mode 100644 index 0000000000..863b5df73c --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -0,0 +1,54 @@ +#include "layers/graph_conv_layer.h" +#include "aggregator.h" + +void graph_conv_layer::aggregate(Graph *g, const vec_t &in, tensor_t &out) { + update_all(g, in, out, true, norm_factor); +} + +// ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) +void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); + }, galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z + } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z + aggregate(graph, out_temp, out_data); // aggregate + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + relu(out_data[i], out_data[i]); + }, galois::loopname("relu")); + } +} + +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } + +// ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ +void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? + out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); + }, galois::loopname("d_relu")); + } else copy2D1D(out_grad, out_temp); // TODO: avoid copying + if (level_ != 0) { // no need to calculate in_grad for the first layer + vec_t trans_W(z*y); + transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same + if (dropout_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); + }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); + } + } + // calculate weight gradients + transpose2D1D(in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z +} + +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } + diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index ccabc8a090..eb02f66d50 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -8,12 +8,24 @@ void relu_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); } +// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) +void relu_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); +} + // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) { +void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { for (size_t j = 0; j < input_dims[1]; ++j) in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); } + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); +} diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 61f63f6f0e..22a9d1a83c 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -21,6 +21,8 @@ void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t & }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); } +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } + void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -32,3 +34,4 @@ void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_ }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } +void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index a4d1d77719..1e3e0e1d79 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,5 +1,7 @@ -#include "math_functions.hpp" +#include "math_functions.hh" #include "utils.h" +#include "galois/Timer.h" +#include extern "C" { #include @@ -242,7 +244,7 @@ void relu(const vec_t &in, vec_t &out) { void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); } } @@ -298,43 +300,6 @@ void sigmoid(vec_t &fv) { } } -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - 
vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - // Softmax function takes an N-dimensional vector (X) of real number, // and transforms it into a vector of real number in range (0,1) which add upto 1. // To make softmax func numerically stable, we simply normalize the values in the vector, diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 7e96afc0c3..0179c46d56 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,19 +1,32 @@ -#pragma once -#include "cutils.h" +#include "math_functions.hh" +#include "common.h" + +extern "C" { +#include +//#include +} // flattern data into 1D before feed into the ReLU operater -__global__ void relu_gpu(const int n, const float_t* in, float_t* out) { +__global__ void relu_kernel(const int n, const float_t* in, float_t* out) { CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } } -__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { +void relu_gpu(const int n, const float_t *in_data, float_t* out_data) { + relu_kernel<<>>(n, in_data, out_data); +} + +__global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; + out_diff[index] = data[index] > 0 ? in_diff[index] : 0; } } +void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff) { + d_relu_kernel<<>>(n, in_diff, data, out_diff); +} + void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { @@ -22,30 +35,41 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(DeepGalois::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +} + +// the arguments of the maxima +int argmax_gpu(const size_t n, const float_t *x) { + return 0; } void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK(cublasSgemv(DeepGalois::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(DeepGalois::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(DeepGalois::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float *x, float* y) { - CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK(cublasScopy(DeepGalois::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { @@ -59,7 +83,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); return; } - set_kernel<<>>(N, alpha, Y); + set_kernel<<>>(N, alpha, Y); } __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { @@ -68,17 +92,17 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) } } -void add_scalar_gpu(const int N, const float alpha, float* Y) { - add_scalar_kernel<<>>(N, alpha, Y); +void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { + add_scalar_kernel<<>>(N, alpha, Y); } -__global__ void add_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { +__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } -void add_gpu(const int N, const float* a, const float* b, float* y) { - add_kernel<<>>(N, a, b, y); +void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { + vadd_kernel<<>>(N, a, b, y); } From 1f9627f32c1a93865f4ea1c3d355c47515f1c686 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 21 Feb 2020 12:56:53 -0600 Subject: [PATCH 005/660] update CMakeLists --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1b0489c10..d25c2764a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -504,6 +504,10 @@ endif(USE_PANGOLIN) if(USE_DEEPGALOIS) add_subdirectory(libdeepgalois) add_subdirectory(lonestargnn) + cuda_include_directories("${CUB_ROOT}") + cuda_include_directories("${MGPU_ROOT}/src") + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + add_subdirectory(libgpu) endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) From b9c8b80566d55f6e9e315abd7fda2610f372bcce Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 21 Feb 2020 13:34:40 -0600 Subject: [PATCH 006/660] fix CMakeLists --- CMakeLists.txt | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d25c2764a4..79555a0b31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,11 +502,26 @@ if(USE_PANGOLIN) add_subdirectory(lonestarmine) endif(USE_PANGOLIN) 
if(USE_DEEPGALOIS) + SET(CUDA_SEPARABLE_COMPILATION ON) + find_package(CUDA REQUIRED) + set(CUDA_PROPAGATE_HOST_FLAGS off) + set(CUDA_SEPARABLE_COMPILATION on) + set(CUDA_HOST_COMPILER g++) + string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) + endforeach() + list(APPEND CUDA_NVCC_FLAGS "-std=c++11") + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") add_subdirectory(libdeepgalois) add_subdirectory(lonestargnn) + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers cuda_include_directories("${CUB_ROOT}") + link_directories(${CMAKE_SOURCE_DIR}/cub) + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers cuda_include_directories("${MGPU_ROOT}/src") - cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + link_directories(${CMAKE_SOURCE_DIR}/moderngpu/src) add_subdirectory(libgpu) endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) From d7801b8ae670ce8935d1d6c5a308073616f5a368 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 10:01:06 -0600 Subject: [PATCH 007/660] add CPU_ONLY flag --- libdeepgalois/CMakeLists.txt | 59 ++++++++++-------- libdeepgalois/include/aggregator.h | 11 +++- .../include/layers/graph_conv_layer.h | 60 +++++++----------- libdeepgalois/include/net.h | 53 ++++++---------- libdeepgalois/src/aggregator.cu | 3 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 61 +++++++++++++++++-- 6 files changed, 139 insertions(+), 108 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7e558221f6..0b0be6217c 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,35 +1,40 @@ +cmake_minimum_required(VERSION 2.8) + SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers +set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) +link_directories(${CMAKE_SOURCE_DIR}/libgalois) -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) -SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) -include_directories(${CUDA_INC}) -link_directories(${CUDA_LIB}) -link_directories(${CMAKE_SOURCE_DIR}/libgpu) - -#set(sources -# src/layers/relu_layer.cu -#) -#target_include_directories(deepgalois_gpu PUBLIC -# ${CMAKE_SOURCE_DIR}/libgpu/include -#) -#set_target_properties(deepgalois_gpu PROPERTIES -# INTERFACE_POSITION_INDEPENDENT_CODE On -# POSITION_INDEPENDENT_CODE On -#) -cmake_minimum_required(VERSION 2.8) -find_package(CUDA REQUIRED) -set(CUDA_SEPARABLE_COMPILATION ON) -set(CUDA_PROPAGATE_HOST_FLAGS OFF) -#set(CUDA_HOST_COMPILER g++) -list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) -include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) -file(GLOB CUDA_FILES "src/" *.cu) -#CUDA_COMPILE(CU_O src/math_functions.cu) -CUDA_COMPILE(CU_O ${CUDA_FILES}) +#deepgalois_option(CPU_ONLY "Build DeepGalois without CUDA support" OFF) +set(CPU_ONLY ON CACHE BOOL "Build DeepGalois without CUDA support") +if(CPU_ONLY) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +else() + SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) + SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) + 
include_directories(${CUDA_INC}) + link_directories(${CUDA_LIB}) + link_directories(${CMAKE_SOURCE_DIR}/libgpu) + find_package(CUDA REQUIRED) + set(CUDA_SEPARABLE_COMPILATION ON) + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(CUDA_HOST_COMPILER g++) + #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") + #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) + #include_directories(${CUB_ROOT}) + #include_directories(${MGPU_ROOT}/src) + cuda_include_directories("${CUB_ROOT}") + cuda_include_directories("${MGPU_ROOT}/src") + file(GLOB CUDA_FILES "src/" *.cu) + CUDA_COMPILE(CU_O src/math_functions.cu src/aggregator.cu) + #CUDA_COMPILE(CU_O ${CUDA_FILES}) +endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 4e178d89b8..a071781f54 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -1,6 +1,15 @@ #pragma once #include "types.h" +#ifdef CPU_ONLY #include "gtypes.h" - void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +#else +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#define TB_SIZE 256 +#define WARP_SIZE 32 +void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +#endif + diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 0633ec63e9..44dfd197af 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -1,6 +1,6 @@ #pragma once #include "layer.h" -#include "gtypes.h" +#include "aggregator.h" /* GraphConv Layer Parameters @@ -16,22 +16,15 @@ */ class graph_conv_layer: public layer { public: +#ifdef CPU_ONLY graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), - dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; - init(); - scale_ = 1. / (1. - dropout_rate_); - } - graph_conv_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} + float dropout_rate, std::vector in_dims, std::vector out_dims); +#else + graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, bool dropout, + float dropout_rate, std::vector in_dims, std::vector out_dims); +#endif + graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : + graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init() { std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; @@ -47,11 +40,8 @@ class graph_conv_layer: public layer { for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); } in_temp.resize(x*y); - //for (size_t i = 0; i < x; ++i) in_temp[i].resize(y); out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - //for (size_t i = 0; i < x; ++i) out_temp[i].resize(z); trans_data.resize(y*x); // y*x - //for (size_t i = 0; i < y; ++i) trans_data[i].resize(x); if (norm_) norm_factor_counting(); t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; @@ -59,20 +49,21 @@ class graph_conv_layer: public layer { std::string layer_type() const override { return std::string("graph_conv"); } void set_context(net_phase ctx) override { phase_ = ctx; } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); - virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function +#ifdef CPU_ONLY virtual void aggregate(Graph *g, const vec_t &in, tensor_t &out); +#else + virtual void aggregate(CSRGraph g, const float_t *in, float_t *out); +#endif // user-defined combine function - virtual void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(mat_v, self, a); - mvmul(mat_u, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors - } + virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); + // user-defined pre-computing function, called during initialization + virtual void norm_factor_counting(); +protected: void degree_counting() { assert(x == graph->size()); degrees.resize(x); @@ -81,19 +72,12 @@ class graph_conv_layer: public layer { }, galois::loopname("DegreeCounting")); } - // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v - void norm_factor_counting() { - degree_counting(); - norm_factor.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); - } - private: +#ifdef CPU_ONLY Graph *graph; +#else + CSRGraph graph_gpu; +#endif bool act_; // whether to use activation function at the end bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 8b51b6d350..288e3aac3b 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -19,8 +19,7 @@ class Net { Net() {} void init() { - read_graph(dataset, g); - n = g.size(); // N + n = read_graph_cpu(dataset, graph_cpu); labels.resize(n, 0); // label for each vertex: N x 1 num_classes = read_labels(dataset, labels); @@ -50,7 +49,6 @@ class Net { size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } size_t get_nnodes() { return n; } - size_t get_nedges() { return g.sizeEdges(); } size_t get_ft_dim() { return feature_dims[0]; } size_t get_nclasses() { return num_classes; } size_t 
get_label(size_t i) { return labels[i]; } @@ -79,7 +77,11 @@ class Net { in_dims[0] = out_dims[0] = n; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#ifdef CPU_ONLY + layers[layer_id] = new graph_conv_layer(layer_id, &graph_cpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#else + layers[layer_id] = new graph_conv_layer(layer_id, &graph_gpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#endif if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } @@ -177,8 +179,11 @@ class Net { size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 std::vector feature_dims; // feature dimnesions for each layer - - Graph g; // the input graph, |V| = N +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N +#else + CSRGraph graph_gpu; // the input graph, |V| = N +#endif tensor_t input_features; // input features: N x D std::vector labels; // labels for classification: N x 1 MaskList train_mask, val_mask; // masks for traning and validation @@ -265,42 +270,23 @@ class Net { return n; } - unsigned read_graph(std::string dataset_str, Graph &graph) { - //printf("Start readGraph\n"); + size_t read_graph_cpu(std::string dataset_str, Graph &graph) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); LGraph lgraph; - unsigned max_degree = 0; if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); lgraph.read_edgelist(filename.c_str(), true); //symmetrize genGraph(lgraph, graph); + lgraph.clean(); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); galois::graphs::readGraph(graph, filename); - /* - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - graph.getData(vid) = 1; - //for (auto e : graph.edges(n)) graph.getEdgeData(e) = 1; - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("assignVertexLabels")); - std::vector degrees(graph.size()); - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - degrees[vid] = std::distance(graph.edge_begin(vid), graph.edge_end(vid)); - }, galois::loopname("computeMaxDegree")); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - */ } else { printf("Unkown file format\n"); exit(1); } - if (filetype != "gr") { - max_degree = lgraph.get_max_degree(); - lgraph.clean(); - } - printf("max degree = %u\n", max_degree); Tread.stop(); - //printf("Done readGraph\n"); - std::cout << "num_vertices " << g.size() << " num_edges " << g.sizeEdges() << "\n"; - return max_degree; + return graph.size(); } void genGraph(LGraph &lg, Graph &g) { @@ -312,23 +298,20 @@ class Net { auto row_end = lg.get_offset(i+1); g.fixEndEdge(i, row_end); for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); // do not consider edge labels now + g.constructEdge(offset, lg.get_dest(offset), 0); } } + // comparing outputs with the ground truth (labels) inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { - // comparing outputs with the ground truth (labels) - //acc_t accuracy_all = 0.0; AccumF accuracy_all; accuracy_all.reset(); - //for (size_t i = begin; i < end; i++) { galois::do_all(galois::iterate(begin, end), [&](const auto& 
i) { if (masks[i] == 1) { - int prediction = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); - if ((label_t)prediction == labels[i]) accuracy_all += 1.0; + int preds = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); + if ((label_t)preds == labels[i]) accuracy_all += 1.0; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} return accuracy_all.reduce() / (acc_t)count; } }; diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 1cc93e6866..44a3e59d2d 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,7 +1,6 @@ -#include "csr_graph.h" #include "aggregator.h" #include "math_functions.hh" -void update_all(CSRGraph *g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 863b5df73c..71d5f18f5b 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,10 +1,43 @@ #include "layers/graph_conv_layer.h" -#include "aggregator.h" void graph_conv_layer::aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } +// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v +void graph_conv_layer::norm_factor_counting() { + degree_counting(); + norm_factor.resize(x); + galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} + +void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { + vec_t a(out.size(), 0); + vec_t b(out.size(), 0); + mvmul(Q, self, a); + mvmul(W, neighbors, b); + vadd(a, b, out); // out = W*self + Q*neighbors +} + +#ifdef CPU_ONLY +graph_conv_layer::graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, + bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), + bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + init(); + scale_ = 1. / (1. 
- dropout_rate_); +} + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { // input: x*y; W: y*z; output: x*z @@ -24,8 +57,6 @@ void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &ou } } -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } - // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { if (act_) { @@ -49,6 +80,26 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t transpose2D1D(in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z } +#else +graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, + bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph_gpu(*g), act_(act), norm_(norm), + bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + init(); + scale_ = 1. / (1. - dropout_rate_); +} + +// GPU forward +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +} -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } - +// GPU backward +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { +} +#endif From 6af133b1cac0d57566d4f5f101ea12eeb73bac87 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 11:32:14 -0600 Subject: [PATCH 008/660] fix bug --- libdeepgalois/CMakeLists.txt | 24 +++++++++---------- libdeepgalois/src/layers/graph_conv_layer.cpp | 8 ++++++- lonestargnn/gcn/CMakeLists.txt | 3 +++ 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 0b0be6217c..3519694c57 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -4,35 +4,33 @@ SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) include_directories(${OPENBLAS_INC}) +include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) #deepgalois_option(CPU_ONLY "Build DeepGalois without CUDA support" OFF) -set(CPU_ONLY ON CACHE BOOL "Build DeepGalois without CUDA support") -if(CPU_ONLY) +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() - SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) - SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) - include_directories(${CUDA_INC}) - link_directories(${CUDA_LIB}) - link_directories(${CMAKE_SOURCE_DIR}/libgpu) find_package(CUDA REQUIRED) set(CUDA_SEPARABLE_COMPILATION ON) 
set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") - include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) - include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) - include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) - #include_directories(${CUB_ROOT}) - #include_directories(${MGPU_ROOT}/src) + cuda_include_directories(${CUDA_INC}) cuda_include_directories("${CUB_ROOT}") cuda_include_directories("${MGPU_ROOT}/src") + cuda_include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + link_directories(${CUDA_LIB}) + link_directories(${CMAKE_SOURCE_DIR}/libgpu) file(GLOB CUDA_FILES "src/" *.cu) - CUDA_COMPILE(CU_O src/math_functions.cu src/aggregator.cu) + cuda_cpmpile(CU_O src/math_functions.cu src/aggregator.cu) #CUDA_COMPILE(CU_O ${CUDA_FILES}) endif() diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 71d5f18f5b..98e9e14211 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -80,6 +80,9 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t transpose2D1D(in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z } + +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : @@ -94,7 +97,10 @@ graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool n init(); scale_ = 1. / (1. 
- dropout_rate_); } - + +void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) {} +void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) {} + // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 05484252b8..ae1d2dff4b 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -2,6 +2,9 @@ SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) +if(USE_CPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +endif() app(gcn gcn.cpp) target_link_libraries(gcn deepgalois) From e4535e3ab843f57a1f9b38369b96b64edd7ef021 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 15:29:22 -0600 Subject: [PATCH 009/660] add context --- libdeepgalois/CMakeLists.txt | 8 +- libdeepgalois/include/aggregator.h | 6 +- libdeepgalois/include/common.h | 66 ------ libdeepgalois/include/context.h | 64 ++++++ .../include/layers/graph_conv_layer.h | 56 +---- libdeepgalois/include/layers/layer.h | 5 +- .../include/layers/softmax_loss_layer.h | 8 +- libdeepgalois/include/net.h | 212 ++---------------- libdeepgalois/src/aggregator.cpp | 16 +- libdeepgalois/src/context.cpp | 156 +++++++++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 60 +++-- .../src/layers/softmax_loss_layer.cpp | 12 +- libdeepgalois/src/math_functions.cu | 2 +- libdeepgalois/src/net.cpp | 107 +++++++++ lonestargnn/CMakeLists.txt | 10 + lonestargnn/gcn/CMakeLists.txt | 8 - lonestargnn/gcn/gcn.cpp | 6 +- 17 files changed, 423 insertions(+), 379 deletions(-) delete mode 100644 libdeepgalois/include/common.h create mode 100644 libdeepgalois/include/context.h create mode 100644 libdeepgalois/src/context.cpp create mode 100644 libdeepgalois/src/net.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3519694c57..514af263d4 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -9,6 +9,7 @@ SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) include_directories(${OPENBLAS_INC}) include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CUDA_INC}) link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) @@ -23,14 +24,13 @@ else() set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") - cuda_include_directories(${CUDA_INC}) cuda_include_directories("${CUB_ROOT}") cuda_include_directories("${MGPU_ROOT}/src") cuda_include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) file(GLOB CUDA_FILES "src/" *.cu) - cuda_cpmpile(CU_O src/math_functions.cu src/aggregator.cu) + cuda_compile(CU_O src/math_functions.cu src/aggregator.cu) #CUDA_COMPILE(CU_O ${CUDA_FILES}) endif() @@ -41,6 +41,8 @@ set(sources src/layers/softmax_loss_layer.cpp src/math_functions.cpp src/aggregator.cpp + src/context.cpp + src/net.cpp ${CU_O} ) add_library(deepgalois STATIC ${sources}) @@ -48,7 +50,7 @@ add_library(deepgalois STATIC ${sources}) target_link_libraries(deepgalois galois_shmem gllvm galois_gpu) 
target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) target_link_libraries(deepgalois -lopenblas) -target_link_libraries(deepgalois -lcudart -lcublas) +target_link_libraries(deepgalois -lcudart -lcublas -lcurand) target_include_directories(deepgalois PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index a071781f54..6fb4ec8d41 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,14 +2,14 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); #else #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" #define TB_SIZE 256 #define WARP_SIZE 32 -void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #endif diff --git a/libdeepgalois/include/common.h b/libdeepgalois/include/common.h deleted file mode 100644 index f942fd106c..0000000000 --- a/libdeepgalois/include/common.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once -#include "types.h" -#include "utils.h" -#include "cutils.h" -//#include "random.h" - -class DeepGalois { -public: - ~DeepGalois(); - enum Brew { CPU, GPU }; - static DeepGalois& Get() { - } - inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - inline static curandGenerator_t curand_generator() { return Get().curand_generator_; } - inline static Brew mode() { return Get().mode_; } - inline static void set_mode(Brew mode) { Get().mode_ = mode; } - inline static int solver_count() { return Get().solver_count_; } - inline static void set_solver_count(int val) { Get().solver_count_ = val; } - inline static int solver_rank() { return Get().solver_rank_; } - inline static void set_solver_rank(int val) { Get().solver_rank_ = val; } - inline static bool multiprocess() { return Get().multiprocess_; } - inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; } - inline static bool root_solver() { return Get().solver_rank_ == 0; } - static void SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); - if (Get().curand_generator_) CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); - CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); - //CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - //CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); - } - static void DeviceQuery() {} - static bool CheckDevice(const int device_id) { return true; } - static int FindDevice(const int start_id = 0) { return 0; } - -protected: - cublasHandle_t cublas_handle_; // used to call cuBLAS - curandGenerator_t curand_generator_; // used to generate random numbers on GPU - //shared_ptr random_generator_; - Brew mode_; - // Parallel training - int solver_count_; - int solver_rank_; - bool multiprocess_; - -private: - // The 
private constructor to avoid duplicate instantiation. - DeepGalois() : cublas_handle_(NULL), curand_generator_(NULL), - //random_generator_(NULL), mode_(DeepGalois::CPU), - mode_(DeepGalois::CPU), - solver_count_(1), solver_rank_(0), multiprocess_(false) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - std::cout << "Cannot create Cublas handle. Cublas won't be available."; - } - // Try to create a curand handler. - //if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || - // curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) - // std::cout << "Cannot create Curand generator. Curand won't be available."; - } -}; - diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h new file mode 100644 index 0000000000..1fc8b6ffc4 --- /dev/null +++ b/libdeepgalois/include/context.h @@ -0,0 +1,64 @@ +#pragma once +#include +#include +#include "types.h" +#include "utils.h" +#include "lgraph.h" +#include "gtypes.h" +#include "cutils.h" +//#include "random.h" + +class Context { +public: + Context(); + ~Context(); + enum Brew { CPU, GPU }; + //static Context& Get(); + cublasHandle_t cublas_handle() { return cublas_handle_; } + curandGenerator_t curand_generator() { return curand_generator_; } + Brew mode() { return mode_; } + void set_mode(Brew mode) { mode_ = mode; } + int solver_count() { return solver_count_; } + void set_solver_count(int val) { solver_count_ = val; } + int solver_rank() { return solver_rank_; } + void set_solver_rank(int val) { solver_rank_ = val; } + bool multiprocess() { return multiprocess_; } + void set_multiprocess(bool val) { multiprocess_ = val; } + bool root_solver() { return solver_rank_ == 0; } + void SetDevice(const int device_id); + void DeviceQuery() {} + bool CheckDevice(const int device_id) { return true; } + int FindDevice(const int start_id = 0) { return 0; } + size_t read_graph(std::string dataset_str); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); + size_t read_graph_gpu(std::string dataset_str); + size_t read_labels(std::string dataset_str, size_t num); + label_t get_label(size_t i) { return labels[i]; } + label_t *get_labels_ptr(size_t i) { return &(labels[0]); } + void degree_counting(); + void norm_factor_counting(); +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N +#else + CSRGraph graph_gpu; // the input graph, |V| = N +#endif + std::vector labels; // labels for classification: N x 1 + std::vector norm_factor; // normalization constant based on graph structure + std::vector degrees; + +protected: + Brew mode_; + cublasHandle_t cublas_handle_; // used to call cuBLAS + curandGenerator_t curand_generator_; // used to generate random numbers on GPU + //shared_ptr random_generator_; + // Parallel training + int solver_count_; + int solver_rank_; + bool multiprocess_; + void genGraph(LGraph &lg, Graph &g); + +private: + // The private constructor to avoid duplicate instantiation. 
+ //Context(); +}; + diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 44dfd197af..ff7fb82b31 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -16,68 +16,28 @@ */ class graph_conv_layer: public layer { public: -#ifdef CPU_ONLY - graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, std::vector out_dims); -#else - graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, bool dropout, + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims); -#endif graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : - graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} + graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} - void init() { - std::cout << name_ << ": allocating memory for parameters and intermediate data... "; - Timer t_alloc; - t_alloc.Start(); - // randomly initialize trainable parameters for conv layers - rand_init_matrix(y, z, W); - //rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); - alloc_grad(); - if (dropout_) { - dropout_mask.resize(x); - for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); - } - in_temp.resize(x*y); - out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - trans_data.resize(y*x); // y*x - if (norm_) norm_factor_counting(); - t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; - } + void init(); std::string layer_type() const override { return std::string("graph_conv"); } - void set_context(net_phase ctx) override { phase_ = ctx; } + void set_netphase(net_phase ctx) override { phase_ = ctx; } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(Graph *g, const vec_t &in, tensor_t &out); + virtual void aggregate(Graph &g, const vec_t &in, tensor_t &out); #else - virtual void aggregate(CSRGraph g, const float_t *in, float_t *out); + virtual void aggregate(CSRGraph &g, const float_t *in, float_t *out); #endif // user-defined combine function virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); - // user-defined pre-computing function, called during initialization - virtual void norm_factor_counting(); - -protected: - void degree_counting() { - assert(x == graph->size()); - degrees.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - degrees[v] = std::distance(graph->edge_begin(v), graph->edge_end(v)); - }, galois::loopname("DegreeCounting")); - } private: -#ifdef CPU_ONLY - Graph *graph; -#else - CSRGraph graph_gpu; -#endif bool act_; // whether to use activation function at the end bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards @@ -91,11 +51,9 @@ class graph_conv_layer: public layer { vec_t out_temp; vec_t in_temp; vec_t 
trans_data; // y*x - std::vector degrees; - std::vector norm_factor; // normalization constant based on graph structure std::vector > dropout_mask; - // Glorot & Bengio (AISTATS 2010) init + // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { auto init_range = sqrt(6.0/(dim_x + dim_y)); std::default_random_engine rng; diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 076253fe82..15e7d88900 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -17,6 +17,7 @@ #include "../types.h" #include "../utils.h" #include "../gtypes.h" +#include "../context.h" #include "../optimizer.h" #include "../math_functions.hh" /** @@ -42,7 +43,8 @@ class layer : public node { virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) = 0; virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; virtual std::string layer_type() const = 0; - virtual void set_context(net_phase ctx) {} + virtual void set_context(Context *ctx) { context = ctx; } + virtual void set_netphase(net_phase phase) {} void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } @@ -136,6 +138,7 @@ class layer : public node { vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E vec_t weight_grad; // weight gradient for updating parameters vec_t loss; // error for each vertex: N x 1 + Context *context; }; // head: layer i+1, tail: layer i diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 236fd35118..cb698491fc 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -3,18 +3,12 @@ class softmax_loss_layer: public layer { public: - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims, LabelList *lab); - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : softmax_loss_layer(level, in_dims, out_dims, NULL) {} + softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); - -private: - LabelList *labels; }; diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 288e3aac3b..dba2753221 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,9 +2,10 @@ #define _MODEL_H_ #include +#include "types.h" #include "gtypes.h" +#include "context.h" #include "galois/Timer.h" -#include "lgraph.h" #include "layers.h" #include "optimizer.h" @@ -17,52 +18,28 @@ class Net { public: Net() {} - - void init() { - n = read_graph_cpu(dataset, graph_cpu); - labels.resize(n, 0); // label for each vertex: N x 1 - num_classes = read_labels(dataset, labels); - - std::cout << "Reading label masks ... 
"; - train_mask.resize(n, 0); - val_mask.resize(n, 0); - if (dataset == "reddit") { - train_begin = 0, train_count = 153431, train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; - } else { - train_count = read_masks(dataset, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset, "val", val_begin, val_end, val_mask); - } - std::cout << "Done\n"; - - num_layers = NUM_CONV_LAYERS + 1; - feature_dims.resize(num_layers + 1); - input_features.resize(n); // input embedding: N x D - feature_dims[0] = read_features(dataset, input_features); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - feature_dims[3] = num_classes; // normalized output embedding: E - layers.resize(num_layers); - } + void init(std::string dataset_str, unsigned epochs, unsigned hidden1); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_nnodes() { return n; } size_t get_ft_dim() { return feature_dims[0]; } - size_t get_nclasses() { return num_classes; } - size_t get_label(size_t i) { return labels[i]; } + size_t read_features(std::string dataset_str, tensor_t &feats); void construct_layers() { std::cout << "\nConstructing layers...\n"; append_conv_layer(0, true); // first conv layer append_conv_layer(1); // hidden1 layer append_out_layer(2); // output layer layers[0]->set_in_data(input_features); // feed input data + set_contexts(); } - void set_netphase(net_phase phase) { + void set_contexts() { for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_context(phase); + layers[i]->set_context(context); + } + + void set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i ++) + layers[i]->set_netphase(phase); } void print_layers_info() { @@ -78,9 +55,9 @@ class Net { in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); #ifdef CPU_ONLY - layers[layer_id] = new graph_conv_layer(layer_id, &graph_cpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); #else - layers[layer_id] = new graph_conv_layer(layer_id, &graph_gpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); #endif if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } @@ -91,7 +68,7 @@ class Net { in_dims[0] = out_dims[0] = n; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims, &labels); + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); connect(layers[layer_id-1], layers[layer_id]); } @@ -130,65 +107,19 @@ class Net { } // training - void train(optimizer *opt) { - std::cout << "\nStart training...\n"; - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - Timer t_epoch; - // run epoches - for (size_t i = 0; i < epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; - t_epoch.Start(); - - 
// training steps - set_netphase(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict - Tfw.stop(); - Tbw.start(); - bprop(); // back propogation - Tbw.stop(); - Tupdate.start(); - update_weights(opt); // update parameters - Tupdate.stop(); - set_netphase(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - - if (do_validate) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); - Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; - } else { - std::cout << " train_time = " << epoch_time << " ms\n"; - } - } - } + void train(optimizer *opt, bool need_validate); + size_t get_nnodes() { return n; } protected: + Context *context; size_t n; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + unsigned num_epochs; // number of epochs std::vector feature_dims; // feature dimnesions for each layer -#ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N -#else - CSRGraph graph_gpu; // the input graph, |V| = N -#endif tensor_t input_features; // input features: N x D - std::vector labels; // labels for classification: N x 1 MaskList train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - std::vector layers; // all the layers in the neural network /* inline void init_features(size_t dim, vec_t &x) { @@ -199,109 +130,6 @@ class Net { } //*/ - // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). - // Note that labels is not one-hot encoded vector and it can be computed - // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. - size_t read_labels(std::string dataset_str, LabelList &labels) { - std::cout << "Reading labels ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == labels.size()); // number of vertices - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < n; ++idx) { - label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; - } - } - v ++; - } - in.close(); - t_read.Stop(); - // number of vertex classes - std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - size_t read_features(std::string dataset_str, tensor_t &feats) { - std::cout << "Reading features ... 
"; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == feats.size()); // m = number of vertices - for (size_t i = 0; i < m; ++i) { - feats[i].resize(n); - for (size_t j = 0; j < n; ++j) - feats[i][j] = 0; - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - feats[u][v] = w; - } - /* - for (size_t i = 0; i < 10; ++i) - for (size_t j = 0; j < n; ++j) - if (feats[i][j] > 0) - std::cout << "feats[" << i << "][" << j << "]: " << feats[i][j] << std::endl; - //*/ - in.close(); - t_read.Stop(); - std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - size_t read_graph_cpu(std::string dataset_str, Graph &graph) { - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - LGraph lgraph; - if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - lgraph.read_edgelist(filename.c_str(), true); //symmetrize - genGraph(lgraph, graph); - lgraph.clean(); - } else if (filetype == "gr") { - std::string filename = path + dataset_str + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph, filename); - } else { printf("Unkown file format\n"); exit(1); } - Tread.stop(); - return graph.size(); - } - - void genGraph(LGraph &lg, Graph &g) { - g.allocateFrom(lg.num_vertices(), lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i+1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); - } - } - // comparing outputs with the ground truth (labels) inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { AccumF accuracy_all; @@ -309,7 +137,7 @@ class Net { galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { int preds = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); - if ((label_t)preds == labels[i]) accuracy_all += 1.0; + if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); return accuracy_all.reduce() / (acc_t)count; diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 4b3f7cbab6..e9fc27d04a 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,14 +1,14 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { +void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { clear(out[src]); // TODO: vectorize clear float_t a = 0.0, b = 0.0; if (norm) a = norm_factor[src]; // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); if (norm) { b = a * norm_factor[dst]; vec_t neighbor = 
in[dst]; @@ -19,15 +19,15 @@ void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const ve }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); } -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { +void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { clear(out[src]); float_t a = 0.0, b = 0.0; if (norm) a = norm_factor[src]; // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); if (norm) { b = a * norm_factor[dst]; vec_t neighbor(len); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp new file mode 100644 index 0000000000..59895347f1 --- /dev/null +++ b/libdeepgalois/src/context.cpp @@ -0,0 +1,156 @@ +#include "context.h" +#include +#include + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +void Context::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +Context::Context() : + mode_(Context::CPU), + cublas_handle_(NULL), curand_generator_(NULL), + //random_generator_(NULL), mode_(Context::CPU), + solver_count_(1), solver_rank_(0), multiprocess_(false) { +#ifndef CPU_ONLY + mode_ = Context::GPU; + // Try to create a cublas handler, and report an error if failed (but we will + // keep the program running as one might just want to run CPU code). + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + std::cout << "Cannot create Cublas handle. Cublas won't be available."; + } + // Try to create a curand handler. + if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) + std::cout << "Cannot create Curand generator. 
Curand won't be available."; +#endif +} + +size_t Context::read_graph(std::string dataset_str) { +#ifdef CPU_ONLY + size_t n = read_graph_cpu(dataset_str, "gr"); +#else + size_t n = read_graph_gpu(dataset_str); +#endif + return n; +} + +size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { + galois::StatTimer Tread("GraphReadingTime"); + Tread.start(); + LGraph lgraph; + if (filetype == "el") { + std::string filename = path + dataset_str + ".el"; + printf("Reading .el file: %s\n", filename.c_str()); + lgraph.read_edgelist(filename.c_str(), true); //symmetrize + genGraph(lgraph, graph_cpu); + lgraph.clean(); + } else if (filetype == "gr") { + std::string filename = path + dataset_str + ".csgr"; + printf("Reading .gr file: %s\n", filename.c_str()); + galois::graphs::readGraph(graph_cpu, filename); + } else { printf("Unkown file format\n"); exit(1); } + Tread.stop(); + std::cout << "num_vertices " << graph_cpu.size() << " num_edges " << graph_cpu.sizeEdges() << "\n"; + return graph_cpu.size(); +} + +size_t Context::read_graph_gpu(std::string dataset_str) { +} + +void Context::genGraph(LGraph &lg, Graph &g) { + g.allocateFrom(lg.num_vertices(), lg.num_edges()); + g.constructNodes(); + for (size_t i = 0; i < lg.num_vertices(); i++) { + g.getData(i) = 1; + auto row_begin = lg.get_offset(i); + auto row_end = lg.get_offset(i+1); + g.fixEndEdge(i, row_end); + for (auto offset = row_begin; offset < row_end; offset ++) + g.constructEdge(offset, lg.get_dest(offset), 0); + } +} + +// user-defined pre-computing function, called during initialization +// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v +void Context::norm_factor_counting() { +#ifdef CPU_ONLY + size_t n = graph_cpu.size(); + norm_factor.resize(n); + galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +#endif +} + +void Context::degree_counting() { +#ifdef CPU_ONLY + size_t n = graph_cpu.size(); + degrees.resize(n); + galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { + degrees[v] = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + }, galois::loopname("DegreeCounting")); +#endif +} + +// labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). +// Note that labels is not one-hot encoded vector and it can be computed +// as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. +size_t Context::read_labels(std::string dataset_str, size_t num) { + std::cout << "Reading labels ... 
"; + labels.resize(num, 0); // label for each vertex: N x 1 + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == labels.size()); // number of vertices + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < n; ++idx) { + label_stream >> x; + if (x != 0) { + labels[v] = idx; + break; + } + } + v ++; + } + in.close(); + t_read.Stop(); + // number of vertex classes + std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; +} + diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 98e9e14211..0dd83b6b07 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,18 +1,7 @@ #include "layers/graph_conv_layer.h" -void graph_conv_layer::aggregate(Graph *g, const vec_t &in, tensor_t &out) { - update_all(g, in, out, true, norm_factor); -} - -// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v -void graph_conv_layer::norm_factor_counting() { - degree_counting(); - norm_factor.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); +void graph_conv_layer::aggregate(Graph &g, const vec_t &in, tensor_t &out) { + update_all(g, in, out, true, context->norm_factor); } void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { @@ -23,10 +12,9 @@ void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t vadd(a, b, out); // out = W*self + Q*neighbors } -#ifdef CPU_ONLY -graph_conv_layer::graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, +graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), + layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices x = input_dims[0]; @@ -38,6 +26,27 @@ graph_conv_layer::graph_conv_layer(unsigned level, Graph *g, bool act, bool norm scale_ = 1. / (1. - dropout_rate_); } +void graph_conv_layer::init() { + std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; + Timer t_alloc; + t_alloc.Start(); + // randomly initialize trainable parameters for conv layers + rand_init_matrix(y, z, W); + //rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, weight_grad); + alloc_grad(); + if (dropout_) { + dropout_mask.resize(x); + for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); + } + in_temp.resize(x*y); + out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + trans_data.resize(y*x); // y*x + t_alloc.Stop(); + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; +} + +#ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { // input: x*y; W: y*z; output: x*z @@ -49,7 +58,7 @@ void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &ou }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - aggregate(graph, out_temp, out_data); // aggregate + aggregate(context->graph_cpu, out_temp, out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { relu(out_data[i], out_data[i]); @@ -69,7 +78,8 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y - update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same + //NOTE: since graph is symmetric, the derivative is the same + update_all(context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); @@ -84,20 +94,6 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else -graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, - bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph_gpu(*g), act_(act), norm_(norm), - bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - init(); - scale_ = 1. / (1. 
- dropout_rate_); -} - void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) {} void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) {} diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 22a9d1a83c..3d8c22bf49 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,7 +1,7 @@ #include "layers/softmax_loss_layer.h" -softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims, LabelList *lab) - : layer(level, in_dims, out_dims), labels(lab) { +softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; loss.resize(in_dims[0]); // error for each sample name_ = layer_type() + "_" + std::to_string(level); @@ -15,7 +15,7 @@ void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t & softmax(in_data[i], out_data[i]); // normalize using softmax // y is a one hot encoded vector for the labels std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; // one-hot + y[context->get_label(i)] = 1.0; // one-hot loss[i] = cross_entropy(y, out_data[i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); @@ -24,14 +24,14 @@ void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t & void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { - //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { vec_t norm_grad(output_dims[1]); std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; + y[context->get_label(i)] = 1.0; d_cross_entropy(y, out_data[i], norm_grad); d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } -void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } +void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { +} diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 0179c46d56..a7e25a7256 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,5 @@ #include "math_functions.hh" -#include "common.h" +#include "context.h" extern "C" { #include diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp new file mode 100644 index 0000000000..aff96fde56 --- /dev/null +++ b/libdeepgalois/src/net.cpp @@ -0,0 +1,107 @@ +#include "net.h" + +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { + context = new Context(); + n = context->read_graph(dataset_str); + num_classes = context->read_labels(dataset_str, n); + context->degree_counting(); + context->norm_factor_counting(); // pre-compute normalizing factor + num_epochs = epochs; + std::cout << "Reading label masks ... 
"; + train_mask.resize(n, 0); + val_mask.resize(n, 0); + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; + } else { + train_count = read_masks(dataset_str, "train", train_begin, train_end, train_mask); + val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); + } + std::cout << "Done\n"; + + num_layers = NUM_CONV_LAYERS + 1; + feature_dims.resize(num_layers + 1); + input_features.resize(n); // input embedding: N x D + feature_dims[0] = read_features(dataset_str, input_features); // input feature dimension: D + feature_dims[1] = hidden1; // hidden1 level embedding: 16 + feature_dims[2] = num_classes; // output embedding: E + feature_dims[3] = num_classes; // normalized output embedding: E + layers.resize(num_layers); +} + +size_t Net::read_features(std::string dataset_str, tensor_t &feats) { + std::cout << "Reading features ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == feats.size()); // m = number of vertices + for (size_t i = 0; i < m; ++i) { + feats[i].resize(n); + for (size_t j = 0; j < n; ++j) + feats[i][j] = 0; + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u][v] = w; + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; +} + +void Net::train(optimizer *opt, bool need_validate) { + std::cout << "\nStart training...\n"; + galois::StatTimer Tupdate("Train-WeightUpdate"); + galois::StatTimer Tfw("Train-Forward"); + galois::StatTimer Tbw("Train-Backward"); + galois::StatTimer Tval("Validation"); + Timer t_epoch; + // run epoches + for (unsigned i = 0; i < num_epochs; i++) { + std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; + t_epoch.Start(); + + // training steps + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + Tfw.start(); + train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict + Tfw.stop(); + Tbw.start(); + bprop(); // back propogation + Tbw.stop(); + Tupdate.start(); + update_weights(opt); // update parameters + Tupdate.stop(); + set_netphases(net_phase::test); + std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + if (need_validate) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + Tval.start(); + double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); + Tval.stop(); + std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; + std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; + } else { + std::cout << " train_time = " << epoch_time << " ms\n"; + } + } +} + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 
9e2597dffb..338ded6c67 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -4,5 +4,15 @@ include_directories(BEFORE ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +include_directories(${CUDA_INC}) + +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) +if(USE_CPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +endif() add_subdirectory(gcn) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index ae1d2dff4b..ccc59d83de 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,11 +1,3 @@ -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) -include_directories(${OPENBLAS_INC}) -link_directories(${OPENBLAS_LIB}) -if(USE_CPU) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") -endif() - app(gcn gcn.cpp) target_link_libraries(gcn deepgalois) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 72fc8373fc..7540a4b0e4 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -10,8 +10,8 @@ int main(int argc, char** argv) { galois::SharedMemSys G; LonestarGnnStart(argc, argv, name, desc, url); Net network; // the neural network to train - network.init(); - network.construct_layers(); // default setting for now; see its implementation to find how to customize it by the user + network.init(dataset, epochs, hidden1); + network.construct_layers(); // default setting for now; can be customized by the user network.print_layers_info(); ResourceManager rm; @@ -21,7 +21,7 @@ int main(int argc, char** argv) { optimizer *opt = new adam(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); - network.train(opt); // do training using training samples + network.train(opt, do_validate); // do training using training samples Ttrain.stop(); if (do_test) { From 591cec5063ca2aba9e0ca9949d699377952a5170 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 18:27:44 -0600 Subject: [PATCH 010/660] add graph_gpu --- libdeepgalois/CMakeLists.txt | 6 +- libdeepgalois/include/aggregator.h | 4 +- libdeepgalois/include/context.h | 31 ++-- libdeepgalois/src/aggregator.cpp | 2 + libdeepgalois/src/aggregator.cu | 5 +- libdeepgalois/src/context.cpp | 52 ++++--- libdeepgalois/src/layers/graph_conv_layer.cpp | 5 + libdeepgalois/src/math_functions.cu | 14 +- libgpu/include/checker.h | 15 ++ libgpu/include/csr_graph.h | 119 +-------------- libgpu/include/gg.h | 10 +- libgpu/include/graph_gpu.h | 137 ++++++++++++++++++ lonestargnn/CMakeLists.txt | 1 + 13 files changed, 227 insertions(+), 174 deletions(-) create mode 100644 libgpu/include/checker.h create mode 100644 libgpu/include/graph_gpu.h diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 514af263d4..e27f822b69 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -10,6 +10,9 @@ include_directories(${OPENBLAS_INC}) include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CUDA_INC}) +include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +include_directories("${CUB_ROOT}") +include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) 
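# Note (added for clarity): CUDA_INC, OPENBLAS_INC, and OPENBLAS_LIB refer to
# machine-specific install locations (set to absolute cluster paths in these
# CMake files); on another system they would need to be overridden, e.g. on
# the cmake command line, before this library configures.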
link_directories(${CMAKE_SOURCE_DIR}/libgalois) @@ -24,9 +27,6 @@ else() set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") - cuda_include_directories("${CUB_ROOT}") - cuda_include_directories("${MGPU_ROOT}/src") - cuda_include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) file(GLOB CUDA_FILES "src/" *.cu) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 6fb4ec8d41..61befebf2d 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -5,9 +5,7 @@ void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); #else -#include "gg.h" -#include "ggcuda.h" -#include "cub/cub.cuh" +#include "graph_gpu.h" #define TB_SIZE 256 #define WARP_SIZE 32 void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 1fc8b6ffc4..884eba685b 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -4,9 +4,12 @@ #include "types.h" #include "utils.h" #include "lgraph.h" +#ifdef CPU_ONLY #include "gtypes.h" +#else +#include "graph_gpu.h" +#endif #include "cutils.h" -//#include "random.h" class Context { public: @@ -14,8 +17,10 @@ class Context { ~Context(); enum Brew { CPU, GPU }; //static Context& Get(); - cublasHandle_t cublas_handle() { return cublas_handle_; } - curandGenerator_t curand_generator() { return curand_generator_; } +#ifndef CPU_ONLY + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } +#endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } int solver_count() { return solver_count_; } @@ -25,13 +30,7 @@ class Context { bool multiprocess() { return multiprocess_; } void set_multiprocess(bool val) { multiprocess_ = val; } bool root_solver() { return solver_rank_ == 0; } - void SetDevice(const int device_id); - void DeviceQuery() {} - bool CheckDevice(const int device_id) { return true; } - int FindDevice(const int start_id = 0) { return 0; } size_t read_graph(std::string dataset_str); - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); - size_t read_graph_gpu(std::string dataset_str); size_t read_labels(std::string dataset_str, size_t num); label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } @@ -39,23 +38,31 @@ class Context { void norm_factor_counting(); #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N + void genGraph(LGraph &lg, Graph &g); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); #else CSRGraph graph_gpu; // the input graph, |V| = N + size_t read_graph_gpu(std::string dataset_str); + void SetDevice(const int device_id); + void DeviceQuery() {} + bool CheckDevice(const int device_id) { return true; } + int FindDevice(const int start_id = 0) { return 0; } #endif std::vector labels; // labels for classification: N x 1 std::vector norm_factor; // normalization constant based on graph structure std::vector degrees; protected: +#ifndef CPU_ONLY + static cublasHandle_t cublas_handle_; // used 
to call cuBLAS + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU +#endif Brew mode_; - cublasHandle_t cublas_handle_; // used to call cuBLAS - curandGenerator_t curand_generator_; // used to generate random numbers on GPU //shared_ptr random_generator_; // Parallel training int solver_count_; int solver_rank_; bool multiprocess_; - void genGraph(LGraph &lg, Graph &g); private: // The private constructor to avoid duplicate instantiation. diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index e9fc27d04a..723a36e9e9 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,3 +1,5 @@ +#include "types.h" +#include "gtypes.h" #include "aggregator.h" #include "math_functions.hh" diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 44a3e59d2d..49fed1e67e 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,6 +1,9 @@ +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" #include "aggregator.h" #include "math_functions.hh" -void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 59895347f1..8d7fa0e00c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,4 +1,5 @@ #include "context.h" +#include "gtypes.h" #include #include @@ -19,6 +20,31 @@ int64_t cluster_seedgen(void) { return seed; } +#ifdef CPU_ONLY +Context::Context() : mode_(Context::CPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { } +Context::~Context() {} +#else +Context::Context() : mode_(Context::GPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { + // Try to create a cublas handler, and report an error if failed (but we will + // keep the program running as one might just want to run CPU code). + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + std::cout << "Cannot create Cublas handle. Cublas won't be available."; + } + // Try to create a curand handler. + if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) + std::cout << "Cannot create Curand generator. Curand won't be available."; +} + +Context::~Context() { + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } +} + void Context::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); @@ -30,25 +56,7 @@ void Context::SetDevice(const int device_id) { CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } - -Context::Context() : - mode_(Context::CPU), - cublas_handle_(NULL), curand_generator_(NULL), - //random_generator_(NULL), mode_(Context::CPU), - solver_count_(1), solver_rank_(0), multiprocess_(false) { -#ifndef CPU_ONLY - mode_ = Context::GPU; - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - std::cout << "Cannot create Cublas handle. 
Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) - std::cout << "Cannot create Curand generator. Curand won't be available."; #endif -} size_t Context::read_graph(std::string dataset_str) { #ifdef CPU_ONLY @@ -59,6 +67,7 @@ size_t Context::read_graph(std::string dataset_str) { return n; } +#ifdef CPU_ONLY size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -79,9 +88,6 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { return graph_cpu.size(); } -size_t Context::read_graph_gpu(std::string dataset_str) { -} - void Context::genGraph(LGraph &lg, Graph &g) { g.allocateFrom(lg.num_vertices(), lg.num_edges()); g.constructNodes(); @@ -94,6 +100,10 @@ void Context::genGraph(LGraph &lg, Graph &g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } +#else +size_t Context::read_graph_gpu(std::string dataset_str) { +} +#endif // user-defined pre-computing function, called during initialization // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 0dd83b6b07..4e27fdd9bb 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,7 +1,12 @@ #include "layers/graph_conv_layer.h" +#ifdef CPU_ONLY void graph_conv_layer::aggregate(Graph &g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, context->norm_factor); +#else +void graph_conv_layer::aggregate(CSRGraph &g, const float_t *in, float_t *out) { + update_all(g, in, out, true, NULL); +#endif } void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index a7e25a7256..064926eb58 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -35,7 +35,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(DeepGalois::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { @@ -52,24 +52,24 @@ int argmax_gpu(const size_t n, const float_t *x) { void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(DeepGalois::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(DeepGalois::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(DeepGalois::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float *x, float* y) { - CUBLAS_CHECK(cublasScopy(DeepGalois::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { diff --git a/libgpu/include/checker.h b/libgpu/include/checker.h new file mode 100644 index 0000000000..7f2cf4e36e --- /dev/null +++ b/libgpu/include/checker.h @@ -0,0 +1,15 @@ +#ifndef CHECKER_H +#define CHECKER_H +#include +#include + +static void check_cuda_error(const cudaError_t e, const char* file, + const int line) { + if (e != cudaSuccess) { + fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); + exit(1); + } +} +#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__) + +#endif diff --git a/libgpu/include/csr_graph.h b/libgpu/include/csr_graph.h index c9e13b88dc..04a8d90f99 100644 --- a/libgpu/include/csr_graph.h +++ b/libgpu/include/csr_graph.h @@ -14,124 +14,7 @@ #ifndef LSG_CSR_GRAPH #define LSG_CSR_GRAPH -#include - -// Adapted from LSG CSRGraph.h - -// TODO: make this template data -typedef unsigned index_type; // should be size_t, but GPU chokes on size_t -typedef int edge_data_type; -typedef int node_data_type; - -// very simple implementation -struct CSRGraph { - unsigned read(char file[], bool read_edge_data = true); - void copy_to_gpu(struct CSRGraph& copygraph); - void copy_to_cpu(struct CSRGraph& copygraph); - - CSRGraph(); - - unsigned init(); - unsigned allocOnHost(bool no_edge_data = false); - unsigned allocOnDevice(bool no_edge_data = false); - void progressPrint(unsigned maxii, unsigned ii); - unsigned readFromGR(char file[], bool read_edge_data = true); - - unsigned deallocOnHost(); - unsigned deallocOnDevice(); - void dealloc(); - - __device__ __host__ bool valid_node(index_type node) { - return (node < nnodes); - } - - __device__ __host__ bool valid_edge(index_type edge) { - return (edge < nedges); - } - - __device__ __host__ index_type getOutDegree(unsigned src) { - assert(src < nnodes); - return row_start[src + 1] - row_start[src]; - }; - - __device__ __host__ index_type getDestination(unsigned src, unsigned edge) { - assert(src < nnodes); - assert(edge < getOutDegree(src)); - - index_type abs_edge = row_start[src] + edge; - assert(abs_edge < nedges); - - return edge_dst[abs_edge]; - }; - - __device__ __host__ index_type getAbsDestination(unsigned abs_edge) { - assert(abs_edge < nedges); - - return edge_dst[abs_edge]; - }; - - __device__ __host__ 
index_type getFirstEdge(unsigned src) { - assert(src <= nnodes); // <= is okay - return row_start[src]; - }; - - __device__ __host__ edge_data_type getWeight(unsigned src, unsigned edge) { - assert(src < nnodes); - assert(edge < getOutDegree(src)); - - index_type abs_edge = row_start[src] + edge; - assert(abs_edge < nedges); - - return edge_data[abs_edge]; - }; - - __device__ __host__ edge_data_type getAbsWeight(unsigned abs_edge) { - assert(abs_edge < nedges); - - return edge_data[abs_edge]; - }; - - void init_from_mgraph(int m, int nnz, index_type *h_row_offsets, index_type *h_column_indices, node_data_type *h_labels) { - nnodes = m; - nedges = nnz; - check_cuda(cudaMalloc((void **)&row_start, (m + 1) * sizeof(index_type))); - check_cuda(cudaMalloc((void **)&edge_dst, nnz * sizeof(index_type))); - check_cuda(cudaMemcpy(row_start, h_row_offsets, (m + 1) * sizeof(index_type), cudaMemcpyHostToDevice)); - check_cuda(cudaMemcpy(edge_dst, h_column_indices, nnz * sizeof(index_type), cudaMemcpyHostToDevice)); - #ifdef ENABLE_LABEL - check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); - check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); - #endif - //int *h_degrees = (int *)malloc(m * sizeof(int)); - //for (int i = 0; i < m; i++) h_degrees[i] = h_row_offsets[i + 1] - h_row_offsets[i]; - //check_cuda(cudaMalloc((void **)&d_degrees, m * sizeof(int))); - //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); - } - - inline __device__ __host__ index_type getEdgeDst(unsigned edge) { - assert(edge < nedges); - return edge_dst[edge]; - }; - inline __device__ __host__ node_data_type getData(unsigned vid) { - return node_data[vid]; - } - inline __device__ __host__ index_type edge_begin(unsigned src) { - assert(src <= nnodes); - return row_start[src]; - }; - inline __device__ __host__ index_type edge_end(unsigned src) { - assert(src <= nnodes); - return row_start[src+1]; - }; - - index_type nnodes, nedges; - index_type* row_start; // row_start[node] points into edge_dst, node starts at - // 0, row_start[nnodes] = nedges - index_type* edge_dst; - edge_data_type* edge_data; - node_data_type* node_data; - bool device_graph; -}; +#include "graph_gpu.h" struct CSRGraphTex : CSRGraph { cudaTextureObject_t edge_dst_tx; diff --git a/libgpu/include/gg.h b/libgpu/include/gg.h index 779aafdd84..7f4a130c23 100644 --- a/libgpu/include/gg.h +++ b/libgpu/include/gg.h @@ -34,14 +34,7 @@ unsigned const debug = GGDEBUG; #include "Timer.h" - -static void check_cuda_error(const cudaError_t e, const char* file, - const int line) { - if (e != cudaSuccess) { - fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); - exit(1); - } -} +#include "checker.h" template static void check_retval(const T retval, const T expected, const char* file, @@ -64,7 +57,6 @@ inline static __device__ __host__ int GG_MIN(int x, int y) { return x; } -#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__) #define check_rv(r, x) check_retval(r, x, __FILE__, __LINE__) #include "bmk2.h" diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h new file mode 100644 index 0000000000..c197c077ec --- /dev/null +++ b/libgpu/include/graph_gpu.h @@ -0,0 +1,137 @@ +/* + csr_graph.h + + Implements a CSR Graph. Part of the GGC source code. + Interface derived from LonestarGPU. + + Copyright (C) 2014--2016, The University of Texas at Austin + + See LICENSE.TXT for copyright license. 
+ + Author: Sreepathi Pai +*/ + +#ifndef CSR_GRAPH +#define CSR_GRAPH + +#include +#include +#include "checker.h" + +// Adapted from LSG CSRGraph.h + +// TODO: make this template data +typedef unsigned index_type; // should be size_t, but GPU chokes on size_t +typedef int edge_data_type; +typedef int node_data_type; + +// very simple implementation +struct CSRGraph { + unsigned read(char file[], bool read_edge_data = true); + void copy_to_gpu(struct CSRGraph& copygraph); + void copy_to_cpu(struct CSRGraph& copygraph); + + CSRGraph(); + + unsigned init(); + unsigned allocOnHost(bool no_edge_data = false); + unsigned allocOnDevice(bool no_edge_data = false); + void progressPrint(unsigned maxii, unsigned ii); + unsigned readFromGR(char file[], bool read_edge_data = true); + + unsigned deallocOnHost(); + unsigned deallocOnDevice(); + void dealloc(); + + __device__ __host__ bool valid_node(index_type node) { + return (node < nnodes); + } + + __device__ __host__ bool valid_edge(index_type edge) { + return (edge < nedges); + } + + __device__ __host__ index_type getOutDegree(unsigned src) { + assert(src < nnodes); + return row_start[src + 1] - row_start[src]; + }; + + __device__ __host__ index_type getDestination(unsigned src, unsigned edge) { + assert(src < nnodes); + assert(edge < getOutDegree(src)); + + index_type abs_edge = row_start[src] + edge; + assert(abs_edge < nedges); + + return edge_dst[abs_edge]; + }; + + __device__ __host__ index_type getAbsDestination(unsigned abs_edge) { + assert(abs_edge < nedges); + + return edge_dst[abs_edge]; + }; + + __device__ __host__ index_type getFirstEdge(unsigned src) { + assert(src <= nnodes); // <= is okay + return row_start[src]; + }; + + __device__ __host__ edge_data_type getWeight(unsigned src, unsigned edge) { + assert(src < nnodes); + assert(edge < getOutDegree(src)); + + index_type abs_edge = row_start[src] + edge; + assert(abs_edge < nedges); + + return edge_data[abs_edge]; + }; + + __device__ __host__ edge_data_type getAbsWeight(unsigned abs_edge) { + assert(abs_edge < nedges); + + return edge_data[abs_edge]; + }; + + void init_from_mgraph(int m, int nnz, index_type *h_row_offsets, index_type *h_column_indices, node_data_type *h_labels) { + nnodes = m; + nedges = nnz; + check_cuda(cudaMalloc((void **)&row_start, (m + 1) * sizeof(index_type))); + check_cuda(cudaMalloc((void **)&edge_dst, nnz * sizeof(index_type))); + check_cuda(cudaMemcpy(row_start, h_row_offsets, (m + 1) * sizeof(index_type), cudaMemcpyHostToDevice)); + check_cuda(cudaMemcpy(edge_dst, h_column_indices, nnz * sizeof(index_type), cudaMemcpyHostToDevice)); + #ifdef ENABLE_LABEL + check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); + check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); + #endif + //int *h_degrees = (int *)malloc(m * sizeof(int)); + //for (int i = 0; i < m; i++) h_degrees[i] = h_row_offsets[i + 1] - h_row_offsets[i]; + //check_cuda(cudaMalloc((void **)&d_degrees, m * sizeof(int))); + //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); + } + + inline __device__ __host__ index_type getEdgeDst(unsigned edge) { + assert(edge < nedges); + return edge_dst[edge]; + }; + inline __device__ __host__ node_data_type getData(unsigned vid) { + return node_data[vid]; + } + inline __device__ __host__ index_type edge_begin(unsigned src) { + assert(src <= nnodes); + return row_start[src]; + }; + inline __device__ __host__ index_type edge_end(unsigned src) { + assert(src <= nnodes); + 
return row_start[src+1]; + }; + + index_type nnodes, nedges; + index_type* row_start; // row_start[node] points into edge_dst, node starts at + // 0, row_start[nnodes] = nedges + index_type* edge_dst; + edge_data_type* edge_data; + node_data_type* node_data; + bool device_graph; +}; +#endif diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 338ded6c67..10e7288dd9 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -6,6 +6,7 @@ include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) include_directories(${CUDA_INC}) +include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) From 2388800b99785ffe3053d4c45eda1d68a48fe1bd Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 08:42:40 -0600 Subject: [PATCH 011/660] fix gpu compilation --- libdeepgalois/CMakeLists.txt | 28 +++++----- libdeepgalois/include/context.h | 1 + libdeepgalois/include/cutils.h | 74 ++++++++++++++++++++++++- libdeepgalois/include/math_functions.hh | 6 +- libdeepgalois/src/aggregator.cu | 2 + libdeepgalois/src/context.cpp | 20 +++---- libdeepgalois/src/net.cpp | 3 + lonestargnn/gcn/CMakeLists.txt | 3 +- 8 files changed, 107 insertions(+), 30 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index e27f822b69..7ff89b086b 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -16,8 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -#deepgalois_option(CPU_ONLY "Build DeepGalois without CUDA support" OFF) -set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() @@ -26,12 +25,14 @@ else() set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") - #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) - file(GLOB CUDA_FILES "src/" *.cu) - cuda_compile(CU_O src/math_functions.cu src/aggregator.cu) - #CUDA_COMPILE(CU_O ${CUDA_FILES}) + set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu) + cuda_add_library(dg_gpu ${CUDA_SOURCES}) + set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") + set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) + #cuda_compile(MF_O src/math_functions.cu) + #cuda_compile(AGG_O src/aggregator.cu) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") @@ -43,22 +44,21 @@ set(sources src/aggregator.cpp src/context.cpp src/net.cpp - ${CU_O} ) -add_library(deepgalois STATIC ${sources}) +add_library(dg_cpu STATIC ${sources}) -target_link_libraries(deepgalois galois_shmem gllvm galois_gpu) -target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) -target_link_libraries(deepgalois -lopenblas) -target_link_libraries(deepgalois -lcudart -lcublas -lcurand) +target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) +target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) +target_link_libraries(dg_cpu -lopenblas) +target_link_libraries(dg_cpu -lcudart -lcublas -lcurand) 
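# Note (added for clarity): with this split the C++ sources build into dg_cpu
# while the CUDA kernels (math_functions.cu, aggregator.cu) build into dg_gpu
# via cuda_add_library above; applications such as lonestargnn/gcn are
# expected to link against both targets.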
-target_include_directories(deepgalois PUBLIC +target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include ${CMAKE_CURRENT_SOURCE_DIR}/include ) -set_target_properties(deepgalois PROPERTIES +set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 884eba685b..39fd817198 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -20,6 +20,7 @@ class Context { #ifndef CPU_ONLY inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } + static void create_blas_handle(); #endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 8a0fcaa3a1..924dfd06e7 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,6 +13,68 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } +inline const char* cublasGetErrorString(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; +#if CUDA_VERSION >= 6000 + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; +#endif +#if CUDA_VERSION >= 6050 + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; +#endif + } + return "Unknown cublas status"; +} + +inline const char* curandGetErrorString(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + return "Unknown curand status"; +} + // CUDA: various checks for different function calls. 
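// Illustrative usage of the checking macros below; d_x, n, and alpha are
// assumed to be declared by the caller, and float_t comes from types.h:
//   CUDA_CHECK(cudaMalloc((void**)&d_x, n * sizeof(float_t)));
//   CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, d_x, 1));
//   CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), d_x, n));
// On failure the cuBLAS/cuRAND variants print the translated error string
// with the file and line, then exit.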
#define CUDA_CHECK(condition) \ do { \ @@ -27,13 +89,21 @@ inline int CUDA_GET_BLOCKS(const int N) { #define CUBLAS_CHECK(condition) \ do { \ cublasStatus_t status = condition; \ - if (status != CUBLAS_STATUS_SUCCESS) \ - ; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cublasGetErrorString(status) ); \ + exit(EXIT_FAILURE); \ + } \ } while (0) #define CURAND_CHECK(condition) \ do { \ curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, curandGetErrorString(status) ); \ + exit(EXIT_FAILURE); \ + } \ } while (0) // CUDA: grid stride looping diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 86363f4ba3..87d48fd92e 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -41,9 +41,9 @@ void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); float_t cross_entropy(const vec_t &y, const vec_t &p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); -void vadd_gpu(const size_t n, const float_t *a, const float_t *b, float_t *out); // vector add -void relu_gpu(const size_t n, const float_t *in, float_t *out); // ReLU -void d_relu_gpu(const size_t n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative +void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add +void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU +void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 49fed1e67e..064a01da0e 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -5,5 +5,7 @@ #include "math_functions.hh" void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + unsigned n = g.nnodes; + vadd_gpu(n, in, in, out); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8d7fa0e00c..bbb68c194e 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -26,16 +26,15 @@ Context::Context() : mode_(Context::CPU), solver_count_(1), Context::~Context() {} #else Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - std::cout << "Cannot create Cublas handle. Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) - std::cout << "Cannot create Curand generator. 
Curand won't be available."; + solver_rank_(0), multiprocess_(false) { } + +cublasHandle_t Context::cublas_handle_ = 0; +curandGenerator_t Context::curand_generator_ = 0; + +void Context::create_blas_handle() { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { @@ -102,6 +101,7 @@ void Context::genGraph(LGraph &lg, Graph &g) { } #else size_t Context::read_graph_gpu(std::string dataset_str) { + return 0; } #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index aff96fde56..eeaf5b668b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,6 +2,9 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); +#ifndef CPU_ONLY + Context::create_blas_handle(); +#endif n = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str, n); context->degree_counting(); diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index ccc59d83de..b71d2df5f8 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,3 +1,4 @@ app(gcn gcn.cpp) -target_link_libraries(gcn deepgalois) +target_link_libraries(gcn dg_cpu dg_gpu) +target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) From 7a45854332181d526284af79ffadcc60801eecaa Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 10:20:14 -0600 Subject: [PATCH 012/660] add gpu graph reading --- libdeepgalois/CMakeLists.txt | 3 +- libdeepgalois/include/context.h | 14 +++++--- libdeepgalois/include/net.h | 39 ++++----------------- libdeepgalois/src/context.cpp | 61 ++++++++++++++++++++++++++++++--- libdeepgalois/src/net.cpp | 52 ++++++++-------------------- libgpu/include/graph_gpu.h | 4 +-- libgpu/src/csr_graph.cu | 4 +-- lonestargnn/CMakeLists.txt | 2 +- lonestargnn/gcn/CMakeLists.txt | 5 ++- 9 files changed, 98 insertions(+), 86 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7ff89b086b..168a022860 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() @@ -29,6 +29,7 @@ else() link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu) cuda_add_library(dg_gpu ${CUDA_SOURCES}) + target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) #cuda_compile(MF_O src/math_functions.cu) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 39fd817198..a40b31b120 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -32,26 +32,32 @@ class Context { void set_multiprocess(bool val) { multiprocess_ = val; } bool root_solver() { return solver_rank_ == 0; } size_t read_graph(std::string dataset_str); - size_t read_labels(std::string dataset_str, size_t num); + size_t read_labels(std::string dataset_str); + size_t 
read_features(std::string dataset_str); label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } void degree_counting(); void norm_factor_counting(); + std::vector labels; // labels for classification: N x 1 + std::vector norm_factor; // normalization constant based on graph structure + std::vector degrees; + tensor_t h_feats; // input features: N x D + size_t feat_len; // input feature length: D #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N void genGraph(LGraph &lg, Graph &g); size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); #else CSRGraph graph_gpu; // the input graph, |V| = N + label_t *d_labels; // labels on device + float_t *d_norm_factor; // norm_factor on device + float_t *d_feats; // input features on device size_t read_graph_gpu(std::string dataset_str); void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } #endif - std::vector labels; // labels for classification: N x 1 - std::vector norm_factor; // normalization constant based on graph structure - std::vector degrees; protected: #ifndef CPU_ONLY diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index dba2753221..80da9fe1ad 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -21,27 +21,17 @@ class Net { void init(std::string dataset_str, unsigned epochs, unsigned hidden1); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_ft_dim() { return feature_dims[0]; } - size_t read_features(std::string dataset_str, tensor_t &feats); - void construct_layers() { - std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer - layers[0]->set_in_data(input_features); // feed input data - set_contexts(); - } - + size_t get_nnodes() { return num_samples; } + void train(optimizer *opt, bool need_validate); // training + void construct_layers(); void set_contexts() { for (size_t i = 0; i < num_layers; i ++) layers[i]->set_context(context); } - void set_netphases(net_phase phase) { for (size_t i = 0; i < num_layers; i ++) layers[i]->set_netphase(phase); } - void print_layers_info() { for (size_t i = 0; i < num_layers; i ++) layers[i]->print_layer_info(); @@ -51,21 +41,17 @@ class Net { assert(dropout_rate < 1.0); assert(layer_id < NUM_CONV_LAYERS); std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; + in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); -#ifdef CPU_ONLY layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); -#else - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); -#endif if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } void append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; + in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); @@ -106,29 +92,16 @@ class Net { return t_eval.Millisecs(); } - // training - void train(optimizer 
*opt, bool need_validate); - size_t get_nnodes() { return n; } - protected: Context *context; - size_t n; // number of samples: N + size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 unsigned num_epochs; // number of epochs std::vector feature_dims; // feature dimnesions for each layer - tensor_t input_features; // input features: N x D MaskList train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - /* - inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); - } - //*/ // comparing outputs with the ground truth (labels) inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bbb68c194e..f0854eb403 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -101,8 +101,19 @@ void Context::genGraph(LGraph &lg, Graph &g) { } #else size_t Context::read_graph_gpu(std::string dataset_str) { + std::string filename = path + dataset_str + ".csgr"; + graph_gpu.read(filename.c_str(), false); + exit(0); return 0; } + +void copy_data_to_device() { + CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); + CUDA_SAFE_CALL(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); + CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} #endif // user-defined pre-computing function, called during initialization @@ -132,18 +143,17 @@ void Context::degree_counting() { // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). // Note that labels is not one-hot encoded vector and it can be computed // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. -size_t Context::read_labels(std::string dataset_str, size_t num) { +size_t Context::read_labels(std::string dataset_str) { std::cout << "Reading labels ... "; - labels.resize(num, 0); // label for each vertex: N x 1 Timer t_read; t_read.Start(); std::string filename = path + dataset_str + "-labels.txt"; std::ifstream in; std::string line; in.open(filename, std::ios::in); - size_t m, n; + size_t m, n; // m: number of vertices; n: number of classes in >> m >> n >> std::ws; - assert(m == labels.size()); // number of vertices + labels.resize(m, 0); // label for each vertex: N x 1 unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); @@ -159,8 +169,49 @@ size_t Context::read_labels(std::string dataset_str, size_t num) { } in.close(); t_read.Stop(); - // number of vertex classes + // print the number of vertex classes std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; return n; } +size_t Context::read_features(std::string dataset_str) { + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; // m = number of vertices + in >> m >> feat_len >> std::ws; + //assert(m == ); + h_feats.resize(m); + for (size_t i = 0; i < m; ++i) { + h_feats[i].resize(feat_len); + for (size_t j = 0; j < feat_len; ++j) + h_feats[i][j] = 0; + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + h_feats[u][v] = w; + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; + return feat_len; +} + +/* +inline void init_features(size_t dim, vec_t &x) { + std::default_random_engine rng; + std::uniform_real_distribution dist(0, 0.1); + for (size_t i = 0; i < dim; ++i) + x[i] = dist(rng); +} +//*/ + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index eeaf5b668b..a7cd4ba567 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -5,14 +5,15 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { #ifndef CPU_ONLY Context::create_blas_handle(); #endif - n = context->read_graph(dataset_str); - num_classes = context->read_labels(dataset_str, n); + num_samples = context->read_graph(dataset_str); + num_classes = context->read_labels(dataset_str); context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; + std::cout << "Reading label masks ... "; - train_mask.resize(n, 0); - val_mask.resize(n, 0); + train_mask.resize(num_samples, 0); + val_mask.resize(num_samples, 0); if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; @@ -26,45 +27,13 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { num_layers = NUM_CONV_LAYERS + 1; feature_dims.resize(num_layers + 1); - input_features.resize(n); // input embedding: N x D - feature_dims[0] = read_features(dataset_str, input_features); // input feature dimension: D + feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D feature_dims[1] = hidden1; // hidden1 level embedding: 16 feature_dims[2] = num_classes; // output embedding: E feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); } -size_t Net::read_features(std::string dataset_str, tensor_t &feats) { - std::cout << "Reading features ... 
"; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == feats.size()); // m = number of vertices - for (size_t i = 0; i < m; ++i) { - feats[i].resize(n); - for (size_t j = 0; j < n; ++j) - feats[i][j] = 0; - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - feats[u][v] = w; - } - in.close(); - t_read.Stop(); - std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; -} - void Net::train(optimizer *opt, bool need_validate) { std::cout << "\nStart training...\n"; galois::StatTimer Tupdate("Train-WeightUpdate"); @@ -108,3 +77,12 @@ void Net::train(optimizer *opt, bool need_validate) { } } +void Net::construct_layers() { + std::cout << "\nConstructing layers...\n"; + append_conv_layer(0, true); // first conv layer + append_conv_layer(1); // hidden1 layer + append_out_layer(2); // output layer + layers[0]->set_in_data(context->h_feats); // feed input data + set_contexts(); +} + diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index c197c077ec..2458ad8632 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -27,7 +27,7 @@ typedef int node_data_type; // very simple implementation struct CSRGraph { - unsigned read(char file[], bool read_edge_data = true); + unsigned read(const char file[], bool read_edge_data = true); void copy_to_gpu(struct CSRGraph& copygraph); void copy_to_cpu(struct CSRGraph& copygraph); @@ -37,7 +37,7 @@ struct CSRGraph { unsigned allocOnHost(bool no_edge_data = false); unsigned allocOnDevice(bool no_edge_data = false); void progressPrint(unsigned maxii, unsigned ii); - unsigned readFromGR(char file[], bool read_edge_data = true); + unsigned readFromGR(const char file[], bool read_edge_data = true); unsigned deallocOnHost(); unsigned deallocOnDevice(); diff --git a/libgpu/src/csr_graph.cu b/libgpu/src/csr_graph.cu index 554550fe91..d00912a404 100644 --- a/libgpu/src/csr_graph.cu +++ b/libgpu/src/csr_graph.cu @@ -150,7 +150,7 @@ void CSRGraph::progressPrint(unsigned maxii, unsigned ii) { } } -unsigned CSRGraph::readFromGR(char file[], bool read_edge_data) { +unsigned CSRGraph::readFromGR(const char file[], bool read_edge_data) { std::ifstream cfile; cfile.open(file); @@ -237,7 +237,7 @@ unsigned CSRGraph::readFromGR(char file[], bool read_edge_data) { return 0; } -unsigned CSRGraph::read(char file[], bool read_edge_data) { +unsigned CSRGraph::read(const char file[], bool read_edge_data) { return readFromGR(file, read_edge_data); } diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 10e7288dd9..e270f63011 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -4,7 +4,7 @@ include_directories(BEFORE ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) include_directories(${CUDA_INC}) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index b71d2df5f8..715a68d497 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,4 +1,7 @@ app(gcn gcn.cpp) -target_link_libraries(gcn dg_cpu 
dg_gpu) +target_link_libraries(gcn dg_cpu) +if(NOT USE_CPU) + target_link_libraries(gcn dg_gpu) +endif() target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) From eda1ffdabdf4513a1f0cf591399c131877f5b6d4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 10:34:07 -0600 Subject: [PATCH 013/660] add copy_data --- libdeepgalois/include/context.h | 5 ++++- libdeepgalois/src/context.cpp | 30 ++++++++++++++---------------- libdeepgalois/src/net.cpp | 7 ++++--- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index a40b31b120..d03740358e 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -20,7 +20,7 @@ class Context { #ifndef CPU_ONLY inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } - static void create_blas_handle(); + //static void create_blas_handle(); #endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } @@ -42,6 +42,8 @@ class Context { std::vector norm_factor; // normalization constant based on graph structure std::vector degrees; tensor_t h_feats; // input features: N x D + size_t n; // number of samples: N + size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N @@ -53,6 +55,7 @@ class Context { float_t *d_norm_factor; // norm_factor on device float_t *d_feats; // input features on device size_t read_graph_gpu(std::string dataset_str); + void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index f0854eb403..d71baebc9c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -25,13 +25,12 @@ Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) { } Context::~Context() {} #else -Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { } - cublasHandle_t Context::cublas_handle_ = 0; curandGenerator_t Context::curand_generator_ = 0; -void Context::create_blas_handle() { +Context::Context() : mode_(Context::GPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { +//void Context::create_blas_handle() { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); @@ -59,9 +58,9 @@ void Context::SetDevice(const int device_id) { size_t Context::read_graph(std::string dataset_str) { #ifdef CPU_ONLY - size_t n = read_graph_cpu(dataset_str, "gr"); + n = read_graph_cpu(dataset_str, "gr"); #else - size_t n = read_graph_gpu(dataset_str); + n = read_graph_gpu(dataset_str); #endif return n; } @@ -103,8 +102,7 @@ void Context::genGraph(LGraph &lg, Graph &g) { size_t Context::read_graph_gpu(std::string dataset_str) { std::string filename = path + dataset_str + ".csgr"; graph_gpu.read(filename.c_str(), false); - exit(0); - return 0; + return graph_gpu.nnodes; } void copy_data_to_device() { @@ -112,7 +110,7 @@ void copy_data_to_device() { CUDA_SAFE_CALL(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * 
sizeof(float_t))); CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); - CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * sizeof(float_t), cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } #endif @@ -120,7 +118,6 @@ void copy_data_to_device() { // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY - size_t n = graph_cpu.size(); norm_factor.resize(n); galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { float_t temp = std::sqrt(float_t(degrees[v])); @@ -132,7 +129,6 @@ void Context::norm_factor_counting() { void Context::degree_counting() { #ifdef CPU_ONLY - size_t n = graph_cpu.size(); degrees.resize(n); galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { degrees[v] = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); @@ -151,14 +147,15 @@ size_t Context::read_labels(std::string dataset_str) { std::ifstream in; std::string line; in.open(filename, std::ios::in); - size_t m, n; // m: number of vertices; n: number of classes - in >> m >> n >> std::ws; + size_t m; // m: number of samples + in >> m >> num_classes >> std::ws; + assert(m == n); labels.resize(m, 0); // label for each vertex: N x 1 unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); unsigned x; - for (size_t idx = 0; idx < n; ++idx) { + for (size_t idx = 0; idx < num_classes; ++idx) { label_stream >> x; if (x != 0) { labels[v] = idx; @@ -170,8 +167,9 @@ size_t Context::read_labels(std::string dataset_str) { in.close(); t_read.Stop(); // print the number of vertex classes - std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; + std::cout << "Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + return num_classes; } size_t Context::read_features(std::string dataset_str) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a7cd4ba567..9d1fe771fb 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,9 +2,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); -#ifndef CPU_ONLY - Context::create_blas_handle(); -#endif + //Context::create_blas_handle(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); context->degree_counting(); @@ -32,6 +30,9 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { feature_dims[2] = num_classes; // output embedding: E feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); +#ifndef CPU_ONLY + copy_data_to_device(); // copy labels and input features to the device +#endif } void Net::train(optimizer *opt, bool need_validate) { From 0400c451b79f062a1e627bed87876b9dcc5082f1 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 18:58:55 -0600 Subject: [PATCH 014/660] convert data to 1D --- libdeepgalois/CMakeLists.txt | 1 - libdeepgalois/include/aggregator.h | 5 +- libdeepgalois/include/context.h | 4 +- libdeepgalois/include/layers.h | 6 +- .../include/layers/graph_conv_layer.h | 10 +- libdeepgalois/include/layers/layer.h | 36 +++---- .../include/layers/softmax_loss_layer.h | 4 +- libdeepgalois/include/math_functions.hh | 12 +++ libdeepgalois/include/net.h | 2 +- libdeepgalois/include/node.h | 44 ++++----- libdeepgalois/src/aggregator.cpp | 29 +----- 
libdeepgalois/src/aggregator.cu | 4 +- libdeepgalois/src/context.cpp | 15 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 39 ++++---- .../src/layers/softmax_loss_layer.cpp | 21 +++-- libdeepgalois/src/math_functions.cpp | 94 +++++++++++++++++++ libdeepgalois/src/net.cpp | 2 +- 17 files changed, 195 insertions(+), 133 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 168a022860..0fe04fab0d 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -38,7 +38,6 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources - src/layers/relu_layer.cpp src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/math_functions.cpp diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 61befebf2d..5f818d6ce2 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,12 +2,11 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor); #else #include "graph_gpu.h" #define TB_SIZE 256 #define WARP_SIZE 32 -void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index d03740358e..1acc9d0b0e 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -39,9 +39,9 @@ class Context { void degree_counting(); void norm_factor_counting(); std::vector labels; // labels for classification: N x 1 - std::vector norm_factor; // normalization constant based on graph structure + vec_t norm_factor; // normalization constant based on graph structure std::vector degrees; - tensor_t h_feats; // input features: N x D + vec_t h_feats; // input features: N x D size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D diff --git a/libdeepgalois/include/layers.h b/libdeepgalois/include/layers.h index 9650e931a9..432d315183 100644 --- a/libdeepgalois/include/layers.h +++ b/libdeepgalois/include/layers.h @@ -1,8 +1,8 @@ #ifndef _LAYERS_H_ #define _LAYERS_H_ -#include "layers/relu_layer.h" -#include "layers/linear_layer.h" -#include "layers/arithmetic_layer.h" +//#include "layers/relu_layer.h" +//#include "layers/linear_layer.h" +//#include "layers/arithmetic_layer.h" #include "layers/graph_conv_layer.h" #include "layers/softmax_loss_layer.h" #endif diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index ff7fb82b31..a74f45f5f0 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -24,15 +24,15 @@ class graph_conv_layer: public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void 
forward_propagation(const vec_t &in_data, vec_t &out_data); + virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(Graph &g, const vec_t &in, tensor_t &out); + virtual void aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out); #else - virtual void aggregate(CSRGraph &g, const float_t *in, float_t *out); + virtual void aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out); #endif // user-defined combine function virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); @@ -51,7 +51,7 @@ class graph_conv_layer: public layer { vec_t out_temp; vec_t in_temp; vec_t trans_data; // y*x - std::vector > dropout_mask; + std::vector dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 15e7d88900..cc2d79dcfe 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -38,13 +38,13 @@ class layer : public node { level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), output_dims(out_dims) { add_edge(); } virtual ~layer() = default; - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; - virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) = 0; - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; virtual std::string layer_type() const = 0; - virtual void set_context(Context *ctx) { context = ctx; } virtual void set_netphase(net_phase phase) {} + virtual void set_context(Context *ctx) { context = ctx; } + virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; + virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } @@ -61,29 +61,22 @@ class layer : public node { count_ = sample_count; masks_ = masks; } - void set_in_data(tensor_t data) { - prev_ = std::make_shared(this, input_dims[1]); + void set_in_data(vec_t data) { + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + // allocate memory for intermediate features prev_->get_data() = data; - prev_->get_gradient().resize(input_dims[0]); // allocate memory for intermediate gradients - //std::cout << "l0 in_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - for (size_t i = 0; i < input_dims[0]; ++i) - prev_->get_gradient()[i].resize(input_dims[1]); + prev_->get_gradient().resize(input_dims[0]*output_dims[1]); } void add_edge() { // add an outgoing edge - next_ = std::make_shared(this, output_dims[1]); + next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate 
feature vectors - next_->get_data().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_data()[i].resize(output_dims[1]); + next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { // allocate memory for intermediate gradients - //std::cout << "l" << level_ << " out_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - next_->get_gradient().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_gradient()[i].resize(output_dims[1]); + next_->get_gradient().resize(output_dims[0]*output_dims[1]); } void forward() { forward_propagation(prev()->get_data(), next()->get_data()); @@ -92,7 +85,6 @@ class layer : public node { back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer *opt) { - //std::cout << "[debug] " << name_ << ": updating weight...\n"; // parallelize only when target size is big enough to mitigate thread spawning overhead. bool parallel = (W.size() >= 512); //vec_t diff; @@ -105,20 +97,16 @@ class layer : public node { prev()->clear_grads(); } inline acc_t get_masked_loss() { - //acc_t total_loss = acc_t(0); - //size_t valid_sample_count = 0; AccumF total_loss; AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - //for (size_t i = begin_; i < end_; i ++) { galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i]) { total_loss += loss[i]; valid_sample_count += 1; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index cb698491fc..0b1e9af3b5 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -6,9 +6,9 @@ class softmax_loss_layer: public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 87d48fd92e..99a9494f08 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -23,23 +23,35 @@ void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector); void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); void copy2D1D(const tensor_t &in, vec_t &out); +void copy1D1D(const vec_t &in, vec_t &out); +void copy1D1D(size_t len, const float_t *in, float_t *out); void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, 
const vec_t &A, const vec_t &B, vec_t &C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); void transpose2D(const tensor_t &in, tensor_t &out); void transpose2D1D(const tensor_t &in, vec_t &out); void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); +void transpose(size_t x, size_t y, const float_t *in, float_t *out); int argmax(const size_t n, const vec_t &x); // the arguments of the maxima +int argmax(const size_t n, const float_t *x); // the arguments of the maxima void clear(vec_t &in); +void clear(size_t n, float_t *in); void relu(const vec_t &in, vec_t &out); // ReLU +void relu(size_t n, const float_t *in, float_t *out); // ReLU void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff); void softmax(const vec_t &input, vec_t &output); +void softmax(size_t n, const float_t *input, float_t *output); void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); +void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp); float_t cross_entropy(const vec_t &y, const vec_t &p); +float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); +void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 80da9fe1ad..66f50a17b6 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -109,7 +109,7 @@ class Net { accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { - int preds = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); + int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index eec041e0e1..b74edec280 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -25,38 +25,31 @@ class node : public std::enable_shared_from_this { // edges manage the input/output data and gradients between nodes class edge { public: - edge(node *prev, size_t len) : - ft_dim_(len), - data_({vec_t(len)}), - grad_({vec_t(len)}), + edge(node *prev, size_t n, size_t len) : + num_samples_(n), ft_dim_(len), + data_(vec_t(n*len)), grad_(vec_t(n*len)), prev_(prev) {} void merge_grads(vec_t *dst) { assert(!grad_.empty()); - const auto &grad_head = grad_[0]; - size_t sz = grad_head.size(); - dst->resize(sz); + dst->resize(ft_dim_); float_t *pdst = &(*dst)[0]; - std::copy(grad_head.begin(), grad_head.end(), pdst); + 
std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < grad_.size(); ++sample) { - for (size_t i = 0; i < sz; i++) pdst[i] += grad_[sample][i]; - //vectorize::reduce(&grad_[sample][0], sz, pdst); + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; + //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); } } void clear_grads() { - for (size_t sample = 0; sample < grad_.size(); ++sample) { - auto &g = grad_[sample]; - std::fill(g.begin(), g.end(), 0.0); // TODO: need vectorize - //vectorize::fill(&g[0], g.size(), float_t{0}); - } + std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize + //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); } - tensor_t *get_data_ptr() { return &data_; } - tensor_t &get_data() { return data_; } - const tensor_t &get_data() const { return data_; } - tensor_t &get_gradient() { return grad_; } - const tensor_t &get_gradient() const { return grad_; } + vec_t &get_data() { return data_; } + const vec_t &get_data() const { return data_; } + vec_t &get_gradient() { return grad_; } + const vec_t &get_gradient() const { return grad_; } float_t *get_gpu_data() const { return gpu_data_; } float_t *get_gpu_gradient() { return gpu_grad_; } @@ -66,12 +59,13 @@ class edge { void add_next_node(node *next) { next_ = next; } private: + size_t num_samples_;// number of samples size_t ft_dim_; // feature dimensions - tensor_t data_; // feature vectors on CPU - tensor_t grad_; // gradients on CPU + vec_t data_; // feature vectors on CPU + vec_t grad_; // gradients on CPU float_t *gpu_data_; // feature vectors on GPU float_t *gpu_grad_; // gradients on CPU - node *prev_; // previous node, "producer" of this tensor - node *next_; // next node, "consumer" of this tensor + node *prev_; // previous node, "producer" of data + node *next_; // next node, "consumer" of data }; diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 723a36e9e9..30b2fc0a5e 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -3,28 +3,9 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { +void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor) { galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { - clear(out[src]); + clear(len, &out[src*len]); float_t a = 0.0, b = 0.0; if (norm) a = norm_factor[src]; // gather neighbors' embeddings @@ -33,9 +14,9 @@ void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t if (norm) { b = a * 
norm_factor[dst]; vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + mul_scalar(len, b, &in[dst*len], &neighbor[0]); + vadd(len, &out[src*len], &neighbor[0], &out[src*len]); // out[src] += in[dst] + } else vadd(len, &out[src*len], &in[dst*len], &out[src*len]); // out[src] += in[dst] } }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); } diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 064a01da0e..04f9c1e8f8 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -4,8 +4,8 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { unsigned n = g.nnodes; - vadd_gpu(n, in, in, out); + vadd_gpu(len, in, in, out); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index d71baebc9c..44b12e4bb0 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -105,12 +105,12 @@ size_t Context::read_graph_gpu(std::string dataset_str) { return graph_gpu.nnodes; } -void copy_data_to_device() { +void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); - CUDA_SAFE_CALL(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); - CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } #endif @@ -183,12 +183,7 @@ size_t Context::read_features(std::string dataset_str) { size_t m; // m = number of vertices in >> m >> feat_len >> std::ws; //assert(m == ); - h_feats.resize(m); - for (size_t i = 0; i < m; ++i) { - h_feats[i].resize(feat_len); - for (size_t j = 0; j < feat_len; ++j) - h_feats[i][j] = 0; - } + h_feats.resize(m*feat_len, 0); while (std::getline(in, line)) { std::istringstream edge_stream(line); unsigned u, v; @@ -196,7 +191,7 @@ size_t Context::read_features(std::string dataset_str) { edge_stream >> u; edge_stream >> v; edge_stream >> w; - h_feats[u][v] = w; + h_feats[u*feat_len+v] = w; } in.close(); t_read.Stop(); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 4e27fdd9bb..ed2e000661 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,11 +1,11 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(Graph &g, const vec_t &in, tensor_t &out) { - update_all(g, in, out, true, context->norm_factor); +void graph_conv_layer::aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out) { + update_all(len, g, in, out, true, context->norm_factor); #else -void graph_conv_layer::aggregate(CSRGraph &g, const float_t *in, float_t *out) { - update_all(g, in, out, true, NULL); +void graph_conv_layer::aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out) { + update_all(len, g, in, out, 
true, context->d_norm_factor); #endif } @@ -40,10 +40,7 @@ void graph_conv_layer::init() { //rand_init_matrix(y, z, Q); zero_init_matrix(y, z, weight_grad); alloc_grad(); - if (dropout_) { - dropout_mask.resize(x); - for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); - } + if (dropout_) dropout_mask.resize(x*y); in_temp.resize(x*y); out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data.resize(y*x); // y*x @@ -53,54 +50,54 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { +void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); + dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - aggregate(context->graph_cpu, out_temp, out_data); // aggregate + } else matmul1D1D(x, z, y, in_data, W, out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(out_data[i], out_data[i]); + relu(z, &out_data[i*z], &out_data[i*z]); }, galois::loopname("relu")); } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ -void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { +void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? - out_temp[i*z+j] = out_data[i][j] > float_t(0) ? out_grad[i][j] : float_t(0); + out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? 
out_grad[i*z+j] : float_t(0); }, galois::loopname("d_relu")); - } else copy2D1D(out_grad, out_temp); // TODO: avoid copying + } else copy1D1D(out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same - update_all(context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); + d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); } } // calculate weight gradients - transpose2D1D(in_data, trans_data); // y*x + transpose(x, y, in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z } void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else -void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) {} -void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) {} +void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} +void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 3d8c22bf49..bc0cd5e953 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,27 +9,30 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { +void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { + size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked - softmax(in_data[i], out_data[i]); // normalize using softmax + softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax // y is a one hot encoded vector for the labels std::vector y(output_dims[1], 0.0); // ground truth y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(y, out_data[i]); + loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); } -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +} -void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { +void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t 
&out_data, vec_t &out_grad, vec_t &in_grad) { + size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(output_dims[1]); - std::vector y(output_dims[1], 0.0); // ground truth + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth y[context->get_label(i)] = 1.0; - d_cross_entropy(y, out_data[i], norm_grad); - d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); + d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); + d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 1e3e0e1d79..f66fb8d8be 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -78,6 +78,13 @@ float_t dot(const vec_t &x, const vec_t &y) { return sum; } +float_t dot(size_t n, const float_t *x, const float_t *y) { + float_t sum = 0; + for (size_t i = 0; i < n; ++i) + sum += x[i] * y[i]; + return sum; +} + // matrix-vector multiply void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { size_t m = out_vector.size(); @@ -118,6 +125,14 @@ void copy2D1D(const tensor_t &in, vec_t &out) { } } +void copy1D1D(const vec_t &in, vec_t &out) { + std::copy(in.begin(), in.end(), &out[0]); +} + +void copy1D1D(size_t len, const float_t *in, float_t *out) { + std::copy(in, in+len, out); +} + void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { @@ -220,6 +235,15 @@ void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { } } } + +void transpose(size_t x, size_t y, const float_t *in, float_t *out) { + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i*x+j] = in[j*y+i]; + } + } +} + int argmax(const size_t n, const vec_t &x) { float_t max = x[0]; int max_ind = 0; @@ -232,16 +256,37 @@ int argmax(const size_t n, const vec_t &x) { return max_ind; } +int argmax(const size_t n, const float_t *x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + void clear(vec_t &in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; } +void clear(size_t n, float_t *in) { + for (size_t i = 0; i < n; i++) in[i] = 0; +} + void relu(const vec_t &in, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) { out[i] = std::max(in[i], (float_t)0) + negative_slope * std::min(in[i], (float_t)0); } } +void relu(size_t n, const float_t *in, float_t *out) { + for (size_t i = 0; i < n; ++i) + out[i] = std::max(in[i], float_t{0}); +} + void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) { out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); @@ -283,11 +328,23 @@ void dropout(const float scale, const float dropout_rate, const vec_t &in, std:: out[i] = in[i] * mask[i] * scale; } +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out) { + for (size_t i = 0; i < n; ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < n; ++i) + out[i] = in[i] * mask[i] * scale; +} + void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { for (size_t i = 0; i < 
in_diff.size(); ++i) out_diff[i] = in_diff[i] * mask[i] * scale; } +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff) { + for (size_t i = 0; i < n; ++i) + out_diff[i] = in_diff[i] * mask[i] * scale; +} + float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } @@ -317,6 +374,17 @@ void softmax(const vec_t &input, vec_t &output) { output[i] /= denominator; } +void softmax(size_t n, const float_t *input, float_t *output) { + const float_t max = *std::max_element(input, input+n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + void log_softmax(const vec_t &input, vec_t &output) { const float_t max = *std::max_element(input.begin(), input.end()); float_t denominator(0); @@ -344,6 +412,16 @@ void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp) { } } +void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } +} + // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability @@ -361,6 +439,16 @@ float_t cross_entropy(const vec_t &y, const vec_t &p) { return loss; } +float_t cross_entropy(size_t n, const float_t *y, const float_t *p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) continue; + if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); + else loss -= y[i] * std::log(p[i]); + } + return loss; +} + void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { auto n = y.size(); //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); @@ -370,3 +458,9 @@ void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { } } +void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9d1fe771fb..ac9f8c98de 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -31,7 +31,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY - copy_data_to_device(); // copy labels and input features to the device + context->copy_data_to_device(); // copy labels and input features to the device #endif } From 2ffb0e8fd5208e888c650558b83935bc0a76049a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 18:59:37 -0600 Subject: [PATCH 015/660] refine interfaces --- libdeepgalois/include/aggregator.h | 2 +- libdeepgalois/include/context.h | 2 +- .../include/layers/graph_conv_layer.h | 6 +-- libdeepgalois/include/layers/layer.h | 14 +++--- .../include/layers/softmax_loss_layer.h | 4 +- libdeepgalois/include/math_functions.hh | 2 +- libdeepgalois/include/node.h | 49 ++++++++++++++----- libdeepgalois/src/aggregator.cpp | 2 +- libdeepgalois/src/context.cpp | 4 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 33 +++++++------ .../src/layers/softmax_loss_layer.cpp | 15 +++--- libdeepgalois/src/math_functions.cpp | 11 ++--- 12 files changed, 87 insertions(+), 57 deletions(-) diff --git 
a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 5f818d6ce2..1ae8d062ae 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,7 +2,7 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor); +void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #else #include "graph_gpu.h" #define TB_SIZE 256 diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 1acc9d0b0e..68967aaeea 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -39,7 +39,7 @@ class Context { void degree_counting(); void norm_factor_counting(); std::vector labels; // labels for classification: N x 1 - vec_t norm_factor; // normalization constant based on graph structure + float_t *norm_factor; // normalization constant based on graph structure std::vector degrees; vec_t h_feats; // input features: N x D size_t n; // number of samples: N diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index a74f45f5f0..b8b42ca1d0 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -24,13 +24,13 @@ class graph_conv_layer: public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } - virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); - virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); + //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); + //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out); + virtual void aggregate(size_t len, Graph &g, const float_t *in, float_t *out); #else virtual void aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out); #endif diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index cc2d79dcfe..737d38fe55 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -41,8 +41,8 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context *ctx) { context = ctx; } - virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; - virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; + //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; @@ -62,21 +62,23 @@ class layer : public node { masks_ = masks; } void set_in_data(vec_t data) { + 
assert(data.size() == input_dims[0]*input_dims[1]); prev_ = std::make_shared(this, input_dims[0], input_dims[1]); // allocate memory for intermediate features - prev_->get_data() = data; + //prev_->get_data() = data; + std::copy(data.begin(), data.end(), prev_->get_data()); // allocate memory for intermediate gradients - prev_->get_gradient().resize(input_dims[0]*output_dims[1]); + //prev_->get_gradient().resize(input_dims[0]*input_dims[1]); } void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors - next_->get_data().resize(output_dims[0]*output_dims[1]); + //next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { // allocate memory for intermediate gradients - next_->get_gradient().resize(output_dims[0]*output_dims[1]); + //next_->get_gradient().resize(output_dims[0]*output_dims[1]); } void forward() { forward_propagation(prev()->get_data(), next()->get_data()); diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 0b1e9af3b5..f4adb51bcd 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -6,9 +6,9 @@ class softmax_loss_layer: public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } - virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); + //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); + //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 99a9494f08..0cbb53bd66 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -26,7 +26,7 @@ void copy2D1D(const tensor_t &in, vec_t &out); void copy1D1D(const vec_t &in, vec_t &out); void copy1D1D(size_t len, const float_t *in, float_t *out); void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); // matrix multiply +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); void transpose2D(const tensor_t &in, tensor_t &out); void transpose2D1D(const tensor_t &in, vec_t &out); diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index b74edec280..4dec9a2080 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -27,31 +27,58 @@ class edge { public: edge(node *prev, size_t n, size_t len) : num_samples_(n), ft_dim_(len), - data_(vec_t(n*len)), grad_(vec_t(n*len)), - prev_(prev) {} + //data_(vec_t(n*len)), grad_(vec_t(n*len)), + data_(NULL), grad_(NULL), prev_(prev) { +#ifdef CPU_ONLY + data_ = new float_t[n*len]; + grad_ = new float_t[n*len]; +#else + 
CUDA_CHECK(cudaMalloc((void **)&data_, n * len * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&grad_, n * len * sizeof(float_t))); +#endif + } void merge_grads(vec_t *dst) { - assert(!grad_.empty()); + //assert(!grad_.empty()); + assert(grad_ != NULL); dst->resize(ft_dim_); float_t *pdst = &(*dst)[0]; - std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); +#ifdef CPU_ONLY + //std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); + std::copy(grad_, grad_+ft_dim_, pdst); // @todo consider adding parallelism and vectorization for (size_t sample = 1; sample < num_samples_; ++sample) { for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); } +#else + CUDA_CHECK(cudaMemcpy(&pdst, grad, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); + //TODO +#endif } void clear_grads() { - std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize +#ifdef CPU_ONLY + //std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize + std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); +#else + CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); +#endif } - +/* vec_t &get_data() { return data_; } const vec_t &get_data() const { return data_; } vec_t &get_gradient() { return grad_; } const vec_t &get_gradient() const { return grad_; } - float_t *get_gpu_data() const { return gpu_data_; } + float_t *get_gpu_data() { return gpu_data_; } + const float_t *get_gpu_data() const { return gpu_data_; } float_t *get_gpu_gradient() { return gpu_grad_; } + const float_t *get_gpu_gradient() const { return gpu_grad_; } +*/ + float_t *get_data() { return data_; } + const float_t *get_data() const { return data_; } + float_t *get_gradient() { return grad_; } + const float_t *get_gradient() const { return grad_; } const node *next() const { return next_; } node *prev() { return prev_; } @@ -61,10 +88,10 @@ class edge { private: size_t num_samples_;// number of samples size_t ft_dim_; // feature dimensions - vec_t data_; // feature vectors on CPU - vec_t grad_; // gradients on CPU - float_t *gpu_data_; // feature vectors on GPU - float_t *gpu_grad_; // gradients on CPU + //vec_t data_; // feature vectors on CPU + //vec_t grad_; // gradients on CPU + float_t *data_; // feature vectors + float_t *grad_; // gradients node *prev_; // previous node, "producer" of data node *next_; // next node, "consumer" of data }; diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 30b2fc0a5e..45862b7516 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -3,7 +3,7 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor) { +void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { clear(len, &out[src*len]); float_t a = 0.0, b = 0.0; diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 44b12e4bb0..3058bac480 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -109,7 +109,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); 
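  // d_feats holds one feature vector of length feat_len per vertex, so the device
  // buffer must be n * feat_len floats (matching the cudaMemcpy of h_feats below);
  // an n-float allocation would be undersized.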
CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } #endif @@ -118,7 +118,7 @@ void Context::copy_data_to_device() { // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY - norm_factor.resize(n); + norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { float_t temp = std::sqrt(float_t(degrees[v])); if (temp == 0.0) norm_factor[v] = 0.0; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index ed2e000661..2685629138 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,7 +1,7 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out) { +void graph_conv_layer::aggregate(size_t len, Graph &g, const float_t *in, float_t *out) { update_all(len, g, in, out, true, context->norm_factor); #else void graph_conv_layer::aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out) { @@ -39,7 +39,7 @@ void graph_conv_layer::init() { rand_init_matrix(y, z, W); //rand_init_matrix(y, z, Q); zero_init_matrix(y, z, weight_grad); - alloc_grad(); + //alloc_grad(); if (dropout_) dropout_mask.resize(x*y); in_temp.resize(x*y); out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py @@ -50,7 +50,8 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) @@ -58,9 +59,9 @@ void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, W, out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + matmul1D1D(x, z, y, &in_temp[0], &W[0], &out_temp[0]); // x*y; y*z; x*z + } else matmul1D1D(x, z, y, in_data, &W[0], &out_temp[0]); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, &out_temp[0], out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { relu(z, &out_data[i*z], &out_data[i*z]); @@ -69,19 +70,21 @@ void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ -void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { +//void 
graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? out_grad[i*z+j] : float_t(0); }, galois::loopname("d_relu")); - } else copy1D1D(out_grad, out_temp); // TODO: avoid copying + //} else copy1D1D(out_grad, out_temp); // TODO: avoid copying + } else copy1D1D(x*z, out_grad, &out_temp[0]); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + matmul1D1D(x, y, z, &out_temp[0], &trans_W[0], &in_temp[0]); // x*z; z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + update_all(y, context->graph_cpu, &in_temp[0], in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); @@ -89,15 +92,13 @@ void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_d } } // calculate weight gradients - transpose(x, y, in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z + transpose(x, y, in_data, &trans_data[0]); // y*x + matmul1D1D(y, z, x, &trans_data[0], &out_temp[0], &weight_grad[0]); // y*x; x*z; y*z } -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else -void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} -void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} +//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} +//void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index bc0cd5e953..7a9686e772 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -6,10 +6,11 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di loss.resize(in_dims[0]); // error for each sample name_ = layer_type() + "_" + std::to_string(level); } - +#ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +//void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked @@ -22,10 +23,8 @@ void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_da }, galois::chunk_size(), galois::steal(), 
galois::loopname("softmax-loss-fw")); } -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { -} - -void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { +//void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { +void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { vec_t norm_grad(len); @@ -35,6 +34,10 @@ void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } +#else // GPU implementation +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +} void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } +#endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index f66fb8d8be..98535d98bd 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -161,15 +161,12 @@ void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { } void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const vec_t &A, const vec_t &B, vec_t &C) { + const float_t *A, const float_t *B, float_t *C) { galois::StatTimer Tmatmul("MatMul"); Tmatmul.start(); - assert(A.size() == dim_x*dim_z); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, &A[0], &B[0], 0.0, &C[0]); + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); Tmatmul.stop(); } @@ -181,7 +178,7 @@ void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) assert(C.size() == dim_x*dim_y); vec_t A1D(dim_x*dim_z); copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); } void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { @@ -198,7 +195,7 @@ void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { std::copy(A[i].begin(), A[i].end(), ptr); ptr += dim_z; } - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C1D); + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); for (size_t i = 0; i < dim_x; i++) { for (size_t j = 0; j < dim_y; ++j) { C[i][j] = C1D[i*dim_y+j]; From e15823ee75be386c2e7681fc4baead010912fb02 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 20:19:15 -0600 Subject: [PATCH 016/660] add gpu data --- libdeepgalois/include/context.h | 1 + libdeepgalois/include/layers/layer.h | 13 +++++++++---- libdeepgalois/include/math_functions.hh | 1 + libdeepgalois/include/node.h | 13 ++++++++----- libdeepgalois/src/context.cpp | 2 ++ libdeepgalois/src/layers/softmax_loss_layer.cpp | 1 + libdeepgalois/src/math_functions.cu | 2 ++ libdeepgalois/src/net.cpp | 2 +- 8 files changed, 25 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 68967aaeea..5a362804cc 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -36,6 +36,7 @@ class Context { size_t 
read_features(std::string dataset_str); label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } + float_t * get_in_ptr(); void degree_counting(); void norm_factor_counting(); std::vector labels; // labels for classification: N x 1 diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 737d38fe55..4e634f4934 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -61,19 +61,23 @@ class layer : public node { count_ = sample_count; masks_ = masks; } - void set_in_data(vec_t data) { + void set_in_data(float_t *data) { assert(data.size() == input_dims[0]*input_dims[1]); prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + // // allocate memory for intermediate features //prev_->get_data() = data; - std::copy(data.begin(), data.end(), prev_->get_data()); + //std::copy(data.begin(), data.end(), prev_->get_data()); // allocate memory for intermediate gradients //prev_->get_gradient().resize(input_dims[0]*input_dims[1]); } void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); //next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { @@ -96,7 +100,8 @@ class layer : public node { //for (size_t i = 0; i < diff.size(); ++i) // diff[i] *= rcp_batch_size; opt->update(weight_grad, W, parallel); // W += grad - prev()->clear_grads(); + //prev()->clear_grads(); + next()->clear_grads(); } inline acc_t get_masked_loss() { AccumF total_loss; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 0cbb53bd66..626ed9b4a6 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -60,5 +60,6 @@ void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima +void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data); #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 4dec9a2080..317a8e6f03 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -28,13 +28,15 @@ class edge { edge(node *prev, size_t n, size_t len) : num_samples_(n), ft_dim_(len), //data_(vec_t(n*len)), grad_(vec_t(n*len)), - data_(NULL), grad_(NULL), prev_(prev) { + data_(NULL), grad_(NULL), prev_(prev) {} + + void alloc() { #ifdef CPU_ONLY - data_ = new float_t[n*len]; - grad_ = new float_t[n*len]; + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; #else - CUDA_CHECK(cudaMalloc((void **)&data_, n * len * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&grad_, n * len * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); #endif } @@ -75,6 +77,7 @@ class edge { float_t 
*get_gpu_gradient() { return gpu_grad_; } const float_t *get_gpu_gradient() const { return gpu_grad_; } */ + void set_data(float_t *ptr) { data_ = ptr; } float_t *get_data() { return data_; } const float_t *get_data() const { return data_; } float_t *get_gradient() { return grad_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 3058bac480..a275cb3b4c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -98,6 +98,7 @@ void Context::genGraph(LGraph &lg, Graph &g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } +float_t * Context::get_in_ptr() { return &h_feats[0]; } #else size_t Context::read_graph_gpu(std::string dataset_str) { std::string filename = path + dataset_str + ".csgr"; @@ -112,6 +113,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } +float_t * Context::get_in_ptr() { return d_feats; } #endif // user-defined pre-computing function, called during initialization diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 7a9686e772..cd1b517ccf 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -36,6 +36,7 @@ void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t } #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, out_data); } void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 064926eb58..b65d39972c 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -106,3 +106,5 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } +void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data) { +} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index ac9f8c98de..857b7691b5 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -83,7 +83,7 @@ void Net::construct_layers() { append_conv_layer(0, true); // first conv layer append_conv_layer(1); // hidden1 layer append_out_layer(2); // output layer - layers[0]->set_in_data(context->h_feats); // feed input data + layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } From 1bc8b8d5a2923d75c7ad02c752b52faaa99482ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 21:20:56 -0600 Subject: [PATCH 017/660] add node.cu --- libdeepgalois/CMakeLists.txt | 9 ++- libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/math_functions.hh | 2 +- libdeepgalois/include/node.h | 58 +++---------------- .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 2 +- libdeepgalois/src/node.cpp | 36 ++++++++++++ libdeepgalois/src/node.cu | 15 +++++ 8 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 libdeepgalois/src/node.cpp create mode 100644 libdeepgalois/src/node.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 0fe04fab0d..73152f8792 100644 --- a/libdeepgalois/CMakeLists.txt +++ 
b/libdeepgalois/CMakeLists.txt @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() @@ -27,7 +27,11 @@ else() #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) - set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu) + set(CUDA_SOURCES + src/math_functions.cu + src/aggregator.cu + src/node.cu + ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") @@ -43,6 +47,7 @@ set(sources src/math_functions.cpp src/aggregator.cpp src/context.cpp + src/node.cpp src/net.cpp ) add_library(dg_cpu STATIC ${sources}) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 4e634f4934..874ce85d30 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -132,7 +132,7 @@ class layer : public node { vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E vec_t weight_grad; // weight gradient for updating parameters - vec_t loss; // error for each vertex: N x 1 + float_t *loss; // error for each vertex: N x 1 Context *context; }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 626ed9b4a6..b4ce800bb7 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -60,6 +60,6 @@ void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data); +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data); #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 317a8e6f03..98b97b2c55 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -1,5 +1,7 @@ #pragma once #include +#include +#include #include "types.h" class node; class layer; @@ -7,7 +9,7 @@ class edge; typedef std::shared_ptr edgeptr_t; -// node data structure +// node data structure: each layer is a node, two layers are connected by an edge class node : public std::enable_shared_from_this { public: node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} @@ -30,53 +32,13 @@ class edge { //data_(vec_t(n*len)), grad_(vec_t(n*len)), data_(NULL), grad_(NULL), prev_(prev) {} - void alloc() { -#ifdef CPU_ONLY - data_ = new float_t[num_samples_ * ft_dim_]; - grad_ = new float_t[num_samples_ * ft_dim_]; -#else - CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); -#endif - } 
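  // alloc(), merge_grads() and clear_grads() are declared here only: their CPU bodies
  // live in src/node.cpp and the cudaMalloc/cudaMemset variants in src/node.cu, so this
  // header stays free of CUDA calls in CPU_ONLY builds.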
+ void alloc(); + void alloc_gpu(); + void merge_grads(vec_t *dst); + void merge_grads_gpu(float_t *dst); + void clear_grads(); + void clear_grads_gpu(); - void merge_grads(vec_t *dst) { - //assert(!grad_.empty()); - assert(grad_ != NULL); - dst->resize(ft_dim_); - float_t *pdst = &(*dst)[0]; -#ifdef CPU_ONLY - //std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); - std::copy(grad_, grad_+ft_dim_, pdst); - // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < num_samples_; ++sample) { - for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; - //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); - } -#else - CUDA_CHECK(cudaMemcpy(&pdst, grad, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); - //TODO -#endif - } - void clear_grads() { -#ifdef CPU_ONLY - //std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize - std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize - //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); -#else - CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); -#endif - } -/* - vec_t &get_data() { return data_; } - const vec_t &get_data() const { return data_; } - vec_t &get_gradient() { return grad_; } - const vec_t &get_gradient() const { return grad_; } - float_t *get_gpu_data() { return gpu_data_; } - const float_t *get_gpu_data() const { return gpu_data_; } - float_t *get_gpu_gradient() { return gpu_grad_; } - const float_t *get_gpu_gradient() const { return gpu_grad_; } -*/ void set_data(float_t *ptr) { data_ = ptr; } float_t *get_data() { return data_; } const float_t *get_data() const { return data_; } @@ -91,8 +53,6 @@ class edge { private: size_t num_samples_;// number of samples size_t ft_dim_; // feature dimensions - //vec_t data_; // feature vectors on CPU - //vec_t grad_; // gradients on CPU float_t *data_; // feature vectors float_t *grad_; // gradients node *prev_; // previous node, "producer" of data diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index cd1b517ccf..d6969b7a95 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -3,7 +3,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; - loss.resize(in_dims[0]); // error for each sample + loss = new float_t[in_dims[0]]; // error for each sample name_ = layer_type() + "_" + std::to_string(level); } #ifdef CPU_ONLY diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index b65d39972c..9f319d1325 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -106,5 +106,5 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } -void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data) { +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data) { } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp new file mode 100644 index 0000000000..fd55aad0e5 --- /dev/null +++ b/libdeepgalois/src/node.cpp @@ -0,0 +1,36 @@ +#include "node.h" + +void edge::alloc() { +#ifdef CPU_ONLY + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; +#else + alloc_gpu(); +#endif +} + +void edge::merge_grads(vec_t *dst) { + assert(grad_ != NULL); + 
dst->resize(ft_dim_); + float_t *pdst = &(*dst)[0]; +#ifdef CPU_ONLY + std::copy(grad_, grad_+ft_dim_, pdst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; + //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); + } +#else + merge_grads_gpu(pdst); +#endif +} + +void edge::clear_grads() { +#ifdef CPU_ONLY + std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize + //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); +#else + clear_grads_gpu(); +#endif +} + diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu new file mode 100644 index 0000000000..87795390ff --- /dev/null +++ b/libdeepgalois/src/node.cu @@ -0,0 +1,15 @@ +#include "node.h" +#include "cutils.h" + +void edge::alloc_gpu() { + CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +} + +void edge::merge_grads_gpu(float_t *dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); +} + +void edge::clear_grads_gpu() { + CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); +} From 159cd9bb730feb4fb83832407fa4969259d4cde2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 09:56:20 -0600 Subject: [PATCH 018/660] add gpu functions --- .../include/layers/graph_conv_layer.h | 8 +-- libdeepgalois/include/layers/layer.h | 2 + libdeepgalois/include/math_functions.hh | 14 ++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 53 +++++++++++-------- libdeepgalois/src/math_functions.cu | 48 +++++++++++++++-- 5 files changed, 94 insertions(+), 31 deletions(-) diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index b8b42ca1d0..f0f27687e7 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -48,10 +48,10 @@ class graph_conv_layer: public layer { size_t x; size_t y; size_t z; - vec_t out_temp; - vec_t in_temp; - vec_t trans_data; // y*x - std::vector dropout_mask; // x*y + float_t *out_temp; + float_t *in_temp; + float_t *trans_data; // y*x + unsigned * dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 874ce85d30..fea557a3ff 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -132,6 +132,8 @@ class layer : public node { vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x 16, layer1: 16 x E vec_t weight_grad; // weight gradient for updating parameters + float_t *d_W; + float_t *d_weight_grad; float_t *loss; // error for each vertex: N x 1 Context *context; }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index b4ce800bb7..691db22a96 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -6,6 +6,11 @@ #include #include "types.h" +extern "C" { +#include +//#include +} + const float negative_slope = 0; void vadd(const vec_t &a, const vec_t &b, vec_t &out); // vector add @@ -53,13 +58,20 @@ float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); +void copy_gpu(size_t len, const float_t *in, float_t *out); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data); +void scal_gpu(const int N, const float alpha, float *X); +void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 2685629138..11e3a6cadb 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -35,15 +35,17 @@ void graph_conv_layer::init() { std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; Timer t_alloc; t_alloc.Start(); - // randomly initialize trainable parameters for conv layers - rand_init_matrix(y, z, W); +#ifdef CPU_ONLY + rand_init_matrix(y, z, W); // randomly initialize trainable parameters //rand_init_matrix(y, z, Q); zero_init_matrix(y, z, weight_grad); - //alloc_grad(); - if (dropout_) dropout_mask.resize(x*y); - in_temp.resize(x*y); - out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - trans_data.resize(y*x); // y*x + if (dropout_) dropout_mask = new unsigned[x*y]; + in_temp = new float_t[x*y]; + out_temp = new float_t[x*z]; // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + trans_data = new float_t[y*x]; // y*x +#else + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); +#endif t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } @@ -51,7 +53,6 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { -//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) @@ -59,9 +60,9 @@ void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_ galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); }, galois::loopname("dropout")); - matmul1D1D(x, z, y, &in_temp[0], &W[0], &out_temp[0]); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, &W[0], &out_temp[0]); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, &out_temp[0], out_data); // aggregate + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { relu(z, &out_data[i*z], &out_data[i*z]); @@ -77,14 +78,13 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? 
out_grad[i*z+j] : float_t(0); }, galois::loopname("d_relu")); - //} else copy1D1D(out_grad, out_temp); // TODO: avoid copying - } else copy1D1D(x*z, out_grad, &out_temp[0]); // TODO: avoid copying + } else copy1D1D(x*z, out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); + float_t *trans_W = new float[z*y]; transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, &out_temp[0], &trans_W[0], &in_temp[0]); // x*z; z*y -> x*y + matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, &in_temp[0], in_grad, true, context->norm_factor); // x*x; x*y -> x*y + update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); @@ -92,19 +92,30 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o } } // calculate weight gradients - transpose(x, y, in_data, &trans_data[0]); // y*x - matmul1D1D(y, z, x, &trans_data[0], &out_temp[0], &weight_grad[0]); // y*x; x*z; y*z + transpose(x, y, in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } #else -//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} -//void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} - // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x*z, out_data, out_data); } // GPU backward void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); + else copy_gpu(x*z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + if (dropout_) d_dropout(y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9f319d1325..9fc01278c9 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,9 +1,47 @@ #include "math_functions.hh" #include "context.h" -extern "C" { -#include -//#include +void gpu_rng_uniform(const int n, unsigned *r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); +} + +void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t{1}) scal_gpu(n, range, r); + if (a != float_t{0}) add_scalar_gpu(n, a, r); +} + +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { + 
CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +} + + +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { + if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0/(y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y*z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +} + +void copy_gpu(size_t len, const float_t *in, float_t *out) { + CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +} + +__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + //masks[i] = bernoulli(dropout_rate); + out[i] = in[i] * masks[i] * scale; + } +} + +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { + dropout_kernel<<>>(n, scale, dropout_rate, in, masks, out); } // flattern data into 1D before feed into the ReLU operater @@ -13,8 +51,8 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } } -void relu_gpu(const int n, const float_t *in_data, float_t* out_data) { - relu_kernel<<>>(n, in_data, out_data); +void relu_gpu(const int n, const float_t *in, float_t* out) { + relu_kernel<<>>(n, in, out); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { From 0d37df5dfaec00c21c68054d4d68212e888b5900 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 10:09:09 -0600 Subject: [PATCH 019/660] fix bug --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 73152f8792..47ace780b9 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 11e3a6cadb..d335d1be65 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -80,9 +80,10 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o }, galois::loopname("d_relu")); } else copy1D1D(x*z, out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer - float_t *trans_W = new float[z*y]; + vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y + //sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; 
z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { From 1c1f9db80d01b82f1647c117666174ba521aed7e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 10:49:21 -0600 Subject: [PATCH 020/660] add agg kernel --- libdeepgalois/src/aggregator.cu | 24 ++++++++++++++++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 04f9c1e8f8..aa3b70b7b6 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,11 +1,33 @@ #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" +#include "cutils.h" #include "aggregator.h" #include "math_functions.hh" +// TODO: use warp +__device__ void scale_add(const int n, const float_t alpha, const float_t* a, const float_t* b, float_t* y) { + for (int i = 0; i < n; i++) y[i] = alpha * a[i] + b[i]; +} + +__global__ void update_all_kernel(size_t n, size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + CUDA_KERNEL_LOOP(src, n) { + float_t a = 0.0, b = 1.0; + if (norm) a = norm_factor[src]; + index_type begin = g.edge_begin(src); + index_type end = g.edge_end(src); + for (index_type e = begin; e != end; e++) { + index_type dst = g.getEdgeDst(e); + assert(dst < n); + if (norm) b = a * norm_factor[dst]; + scale_add(len, b, in+dst*len, out+src*len, out+src*len); // out[src] += in[dst] + } + } +} + void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { unsigned n = g.nnodes; - vadd_gpu(len, in, in, out); + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d335d1be65..8c7ba7fc1f 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -100,6 +100,7 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o #else // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + assert(y <= 128); // currently only support feature length <= 128 if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); From df3bd07abc2ed44acf469661941b7e5433884467 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 13:11:47 -0600 Subject: [PATCH 021/660] add softmax kernel --- libdeepgalois/include/layers/layer.h | 5 ++- .../include/layers/softmax_loss_layer.h | 2 - libdeepgalois/include/math_functions.hh | 4 +- libdeepgalois/include/net.h | 10 ++--- libdeepgalois/include/types.h | 2 - .../src/layers/softmax_loss_layer.cpp | 21 ++++++--- libdeepgalois/src/math_functions.cu | 43 ++++++++++++++++++- libdeepgalois/src/net.cpp | 6 +-- lonestargnn/gcn/gcn.cpp | 4 +- 9 files changed, 72 insertions(+), 25 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index fea557a3ff..057bc58383 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -55,7 +55,7 @@ class layer : public node { << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << 
output_dims[1] << "]\n"; } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, MaskList &masks) { + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { begin_ = sample_begin; end_ = sample_end; count_ = sample_count; @@ -123,7 +123,6 @@ class layer : public node { size_t begin_; // sample begin index size_t end_; // sample end index size_t count_; // number of samples - MaskList masks_; // masks to show which samples are valid size_t num_dims; // number of dimensions std::vector input_dims; // input dimensions std::vector output_dims; // output dimentions @@ -134,6 +133,8 @@ class layer : public node { vec_t weight_grad; // weight gradient for updating parameters float_t *d_W; float_t *d_weight_grad; + mask_t *masks_; // masks to show which samples are valid + mask_t *d_masks_; float_t *loss; // error for each vertex: N x 1 Context *context; }; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index f4adb51bcd..f6d23f6c5a 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -6,9 +6,7 @@ class softmax_loss_layer: public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } - //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); - //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 691db22a96..45a34c7fc6 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -58,6 +58,7 @@ float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); +void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); void copy_gpu(size_t len, const float_t *in, float_t *out); void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); @@ -70,7 +71,8 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data); +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); void scal_gpu(const int 
N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 66f50a17b6..c2bf8e997e 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -59,9 +59,9 @@ class Net { } // forward propagation: [begin, end) is the range of samples used. - acc_t fprop(size_t begin, size_t end, size_t count, MaskList &masks) { + acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, masks); + layers[num_layers-1]->set_sample_mask(begin, end, count, &masks[0]); // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) @@ -83,7 +83,7 @@ class Net { } // evaluate, i.e. inference or predict - double evaluate(size_t begin, size_t end, size_t count, MaskList &masks, acc_t &loss, acc_t &acc) { + double evaluate(size_t begin, size_t end, size_t count, mask_t *masks, acc_t &loss, acc_t &acc) { Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); @@ -99,12 +99,12 @@ class Net { size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 unsigned num_epochs; // number of epochs std::vector feature_dims; // feature dimnesions for each layer - MaskList train_mask, val_mask; // masks for traning and validation + std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { AccumF accuracy_all; accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 8d78e03d48..9b483e1d70 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -17,8 +17,6 @@ typedef std::vector FV2D; // feature vectors: num_samples x feature_dim typedef float acc_t; // Accuracy type typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test -typedef std::vector LabelList; // label list to store label for each vertex -typedef std::vector MaskList; // mask list to store mask for each vertex #define CHUNK_SIZE 256 #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index d6969b7a95..7322e916d7 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -3,8 +3,12 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; - loss = new float_t[in_dims[0]]; // error for each sample name_ = layer_type() + "_" + std::to_string(level); +#ifdef CPU_ONLY + loss = new float_t[in_dims[0]]; // error for each sample +#else + out_malloc_device(in_dims[0], masks_, d_masks_, loss); +#endif } #ifdef CPU_ONLY // TODO: need kernel fusion optimization @@ -27,18 +31,21 @@ void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *ou void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, 
float_t *out_grad, float_t *in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); - d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth + y[context->get_label(i)] = 1.0; + d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); + d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); + } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, out_data); + softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9fc01278c9..4097e0410f 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -16,6 +16,11 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_ CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } +void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss) { + CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +} void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); @@ -144,5 +149,41 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data) { +// TODO: use warp +__device__ void softmax(int n, const float_t *input, float_t *output) { + float_t max = input[0]; + for (size_t i = 1; i < n; i++) if (input[i] > max) max = input[i]; + float_t denominator = 0.0; + for (size_t i = 0; i < n; i++) { + output[i] = exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) output[i] /= denominator; +} + +__device__ void cross_entropy(int n, const label_t idx, float_t *p, float_t &loss) { + if (p[idx] == 0.0) loss -= log(float_t(1e-10)); + else loss -= log(p[idx]); } + +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, + const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { // masked + softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len*i], loss[i]); + } + } +} + +void 
softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { + softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); +} + +void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 857b7691b5..df775c9504 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -51,8 +51,8 @@ void Net::train(optimizer *opt, bool need_validate) { set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict + train_loss = fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, &train_mask[0]); // predict Tfw.stop(); Tbw.start(); bprop(); // back propogation @@ -68,7 +68,7 @@ void Net::train(optimizer *opt, bool need_validate) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); + double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 7540a4b0e4..1ef0fa24f2 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -29,14 +29,14 @@ int main(int argc, char** argv) { size_t n = network.get_nnodes(); acc_t test_loss = 0.0, test_acc = 0.0; size_t test_begin = 0, test_end = n, test_count = n; - MaskList test_mask(n, 0); + std::vector test_mask(n, 0); if (dataset == "reddit") { test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); galois::StatTimer Ttest("Test"); Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_count, test_mask, test_loss, test_acc); + double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; Ttest.stop(); } From a8dc221f6c113f44f92e5da2b603123f69b0d50e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 24 Feb 2020 14:35:06 -0600 Subject: [PATCH 022/660] cmake changes for cpu only compile --- libdeepgalois/CMakeLists.txt | 9 +++++++-- lonestargnn/CMakeLists.txt | 5 ++++- lonestargnn/gcn/CMakeLists.txt | 3 +-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 47ace780b9..d3176699ab 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,7 +40,7 @@ else() #cuda_compile(AGG_O src/aggregator.cu) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -mavx2") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp @@ -52,7 +52,12 @@ set(sources ) 
add_library(dg_cpu STATIC ${sources}) -target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) +if(USE_CPU) + target_link_libraries(dg_cpu galois_shmem gllvm) +else() + target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) +endif() + target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) target_link_libraries(dg_cpu -lcudart -lcublas -lcurand) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index e270f63011..00dac96a27 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -4,9 +4,12 @@ include_directories(BEFORE ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) + SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) include_directories(${CUDA_INC}) -include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +if(NOT USE_CPU) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +endif() SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 715a68d497..3d25bb3966 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -2,6 +2,5 @@ app(gcn gcn.cpp) target_link_libraries(gcn dg_cpu) if(NOT USE_CPU) target_link_libraries(gcn dg_gpu) + target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) endif() -target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) - From 55f18d88ebd3e1b440f1832c6a41f4efd4ef0c7c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 14:35:44 -0600 Subject: [PATCH 023/660] fix bug --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/utils.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 47ace780b9..50e05c53f8 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,7 +40,7 @@ else() #cuda_compile(AGG_O src/aggregator.cu) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -mavx2") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index caf27c56a3..3ca868f501 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -8,7 +8,7 @@ #include #include -const std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset +const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset enum class net_phase { train, test }; class ResourceManager { From e13ba6ccd73e04123db170049a92ac48f324b9c6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 15:23:28 -0600 Subject: [PATCH 024/660] compile on faraday --- libdeepgalois/CMakeLists.txt | 6 +++--- libdeepgalois/src/math_functions.cpp | 9 +++++++++ lonestargnn/CMakeLists.txt | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index d3176699ab..625ab3b6a4 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8) -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) +SET(OPENBLAS_LIB 
/net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) @@ -40,7 +40,7 @@ else() #cuda_compile(AGG_O src/aggregator.cu) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -mavx2") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 98535d98bd..97f6c1198e 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -9,6 +9,7 @@ extern "C" { } // vector add +#if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t &a, const vec_t &b, vec_t &out) { //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; size_t n = out.size(); @@ -26,6 +27,14 @@ void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } +#else +void vadd(const vec_t &a, const vec_t &b, vec_t &out) { + for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; +} +void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { + for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; +} +#endif // vector subtract void vsub(const vec_t &in_a, const vec_t &in_b, vec_t &out) { diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 00dac96a27..e48887e261 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -11,8 +11,8 @@ if(NOT USE_CPU) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) if(USE_CPU) From da535bd5cb9ba6a174bb6f9d37212a6abab0da4a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 15:29:03 -0600 Subject: [PATCH 025/660] fix --- libdeepgalois/include/utils.h | 2 +- libdeepgalois/src/math_functions.cpp | 2 +- libdeepgalois/src/math_functions.cu | 35 ++++++++++++++++++++++++---- libdeepgalois/src/node.cpp | 4 ++-- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 3ca868f501..63d0f74ff7 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -84,7 +84,7 @@ uniform_rand(T min, T max) { } inline bool bernoulli(float_t p) { - return uniform_rand(float_t{0}, float_t{1}) <= p; + return uniform_rand(float_t(0), float_t(1)) <= p; } inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, std::vector &masks) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 97f6c1198e..7e0b805e05 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -290,7 +290,7 @@ void relu(const vec_t &in, vec_t &out) { void relu(size_t n, const float_t *in, float_t *out) { for (size_t i = 0; i < n; ++i) - out[i] = std::max(in[i], float_t{0}); + out[i] = std::max(in[i], float_t(0)); } void d_relu(const vec_t &in_diff, const vec_t &fv, 
vec_t &out_diff) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 4097e0410f..781e4a083a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -8,8 +8,8 @@ void gpu_rng_uniform(const int n, unsigned *r) { void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); const float range = b - a; - if (range != float_t{1}) scal_gpu(n, range, r); - if (a != float_t{0}) add_scalar_gpu(n, a, r); + if (range != float_t(1)) scal_gpu(n, range, r); + if (a != float_t(0)) add_scalar_gpu(n, a, r); } void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { @@ -161,11 +161,28 @@ __device__ void softmax(int n, const float_t *input, float_t *output) { for (size_t i = 0; i < n; i++) output[i] /= denominator; } -__device__ void cross_entropy(int n, const label_t idx, float_t *p, float_t &loss) { +// TODO: use warp +__device__ void d_softmax(size_t n, const float_t *p, const float_t *dp, float_t *dy) { + for (size_t i = 0; i < n; i++) { + dy[i] = 0; + for (size_t j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__device__ void cross_entropy(int n, const label_t idx, const float_t *p, float_t &loss) { if (p[idx] == 0.0) loss -= log(float_t(1e-10)); else loss -= log(p[idx]); } +__device__ void d_cross_entropy(int n, const label_t idx, const float_t *p, float_t *d) { + for (int i = 0; i < n; i++) + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; +} + // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss @@ -184,6 +201,16 @@ void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t * softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); } -void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { +__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, + const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; + d_cross_entropy(len, labels[i], out+len*i, out_grad); + d_softmax(len, out+len*i, out_grad, diff+len*i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index fd55aad0e5..e4cf43dd21 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -27,8 +27,8 @@ void edge::merge_grads(vec_t *dst) { void edge::clear_grads() { #ifdef CPU_ONLY - std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize - //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); + std::fill(grad_, grad_+ft_dim_*num_samples_, float_t(0)); // TODO: need vectorize + //vectorize::fill(&grad_[0], grad_.size(), float_t(0)); #else clear_grads_gpu(); #endif From b1269ca62ad70f1eda99849a05a4559575354e57 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 19:32:52 -0600 Subject: [PATCH 026/660] add context.cu --- libdeepgalois/include/aggregator.h | 2 - libdeepgalois/include/context.h | 30 +++---- libdeepgalois/include/layers/layer.h | 17 +--- 
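One observation about the d_softmax_cross_entropy_kernel added above: it composes d_cross_entropy with the full O(len^2) d_softmax Jacobian and relies on the fixed float_t out_grad[41] scratch array, which silently assumes at most 41 classes. For a one-hot target the composition reduces analytically to p - y, so a fused variant needs neither the scratch buffer nor the inner Jacobian loop. A sketch of that equivalent formulation, keeping the patch's signature and adding the mask check that the forward pass uses (this is an alternative form, not the kernel this commit adds):

__global__ void d_softmax_cross_entropy_fused(int n, int len, const mask_t* masks,
                                              const label_t* labels, const float_t* out,
                                              float_t* diff) {
  CUDA_KERNEL_LOOP(i, n) {
    if (masks[i] == 1) {
      // gradient of cross_entropy(softmax(x)) w.r.t. x is p - y for a one-hot y
      for (int j = 0; j < len; j++)
        diff[len * i + j] = out[len * i + j] - (j == (int)labels[i] ? float_t(1) : float_t(0));
    }
  }
}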
.../include/layers/softmax_loss_layer.h | 1 + libdeepgalois/include/math_functions.hh | 1 + libdeepgalois/include/types.h | 2 + libdeepgalois/src/aggregator.cu | 2 +- libdeepgalois/src/context.cpp | 79 ++---------------- libdeepgalois/src/context.cu | 80 +++++++++++++++++++ .../src/layers/softmax_loss_layer.cpp | 22 ++++- libdeepgalois/src/math_functions.cu | 24 ++++++ libdeepgalois/src/net.cpp | 2 - libdeepgalois/src/node.cu | 2 +- 13 files changed, 149 insertions(+), 115 deletions(-) create mode 100644 libdeepgalois/src/context.cu diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 1ae8d062ae..78749104cf 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -5,8 +5,6 @@ void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #else #include "graph_gpu.h" -#define TB_SIZE 256 -#define WARP_SIZE 32 void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 5a362804cc..884cf51c1a 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -3,25 +3,19 @@ #include #include "types.h" #include "utils.h" -#include "lgraph.h" #ifdef CPU_ONLY +#include "lgraph.h" #include "gtypes.h" #else #include "graph_gpu.h" -#endif #include "cutils.h" +#endif class Context { public: Context(); ~Context(); enum Brew { CPU, GPU }; - //static Context& Get(); -#ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; } - //static void create_blas_handle(); -#endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } int solver_count() { return solver_count_; } @@ -46,21 +40,25 @@ class Context { size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D -#ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N - void genGraph(LGraph &lg, Graph &g); - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); -#else - CSRGraph graph_gpu; // the input graph, |V| = N label_t *d_labels; // labels on device - float_t *d_norm_factor; // norm_factor on device float_t *d_feats; // input features on device + float_t *d_norm_factor; // norm_factor on device + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); size_t read_graph_gpu(std::string dataset_str); void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N + void genGraph(LGraph &lg, Graph &g); +#else + CSRGraph graph_gpu; // the input graph, |V| = N + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } + void norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor); #endif protected: @@ -69,8 +67,6 @@ class Context { static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif Brew mode_; - //shared_ptr random_generator_; - // Parallel training int solver_count_; int solver_rank_; bool multiprocess_; diff --git 
a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 057bc58383..83547a7f1f 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -41,8 +41,7 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context *ctx) { context = ctx; } - //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; - //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual acc_t get_masked_loss() { return acc_t(0); } virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; @@ -103,20 +102,6 @@ class layer : public node { //prev()->clear_grads(); next()->clear_grads(); } - inline acc_t get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; - } protected: unsigned level_; // layer id: [0, num_layers-1] diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index f6d23f6c5a..78166b2fb5 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -8,5 +8,6 @@ class softmax_loss_layer: public layer { std::string layer_type() const override { return std::string("softmax_loss"); } virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); + virtual acc_t get_masked_loss(); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 45a34c7fc6..810bb894b1 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -75,5 +75,6 @@ void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_ void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); void scal_gpu(const int N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); #endif diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 9b483e1d70..720c2ae2b8 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -18,5 +18,7 @@ typedef float acc_t; // Accuracy type typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test #define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define WARP_SIZE 32 #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index aa3b70b7b6..f8d138ca76 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -30,4 +30,4 @@ void update_all(size_t len, CSRGraph &g, const float_t 
*in, float_t *out, bool n CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); } - + diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index a275cb3b4c..50e954a19b 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,59 +1,10 @@ #include "context.h" #include "gtypes.h" -#include -#include - -// random seeding -int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} #ifdef CPU_ONLY Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) { } Context::~Context() {} -#else -cublasHandle_t Context::cublas_handle_ = 0; -curandGenerator_t Context::curand_generator_ = 0; - -Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { -//void Context::create_blas_handle() { - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} - -Context::~Context() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } -} - -void Context::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} #endif size_t Context::read_graph(std::string dataset_str) { @@ -98,22 +49,8 @@ void Context::genGraph(LGraph &lg, Graph &g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } -float_t * Context::get_in_ptr() { return &h_feats[0]; } -#else -size_t Context::read_graph_gpu(std::string dataset_str) { - std::string filename = path + dataset_str + ".csgr"; - graph_gpu.read(filename.c_str(), false); - return graph_gpu.nnodes; -} -void Context::copy_data_to_device() { - CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); -} -float_t * Context::get_in_ptr() { return d_feats; } +float_t * Context::get_in_ptr() { return &h_feats[0]; } #endif // user-defined pre-computing function, called during initialization @@ -122,19 +59,13 @@ void Context::norm_factor_counting() { #ifdef CPU_ONLY norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); + auto degree = 
std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); -#endif -} - -void Context::degree_counting() { -#ifdef CPU_ONLY - degrees.resize(n); - galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { - degrees[v] = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); - }, galois::loopname("DegreeCounting")); +#else + norm_factor_counting_gpu(n, graph_gpu, d_norm_factor); #endif } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu new file mode 100644 index 0000000000..626c5abc0a --- /dev/null +++ b/libdeepgalois/src/context.cu @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include "context.h" + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +__global__ void norm_factor_counting_kernel(size_t n, CSRGraph graph, float_t *norm_factor) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_factor[i] = 0.0; + else norm_factor[i] = 1.0 / temp; + } +} + +void Context::norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor) { + CUDA_CHECK(cudaMalloc((void **)&norm_factor, n * sizeof(float_t))); + norm_factor_counting_kernel<<>>(n, graph, norm_factor); +} + +cublasHandle_t Context::cublas_handle_ = 0; +curandGenerator_t Context::curand_generator_ = 0; + +Context::Context() : mode_(Context::GPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +Context::~Context() { + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } +} + +void Context::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +size_t Context::read_graph_gpu(std::string dataset_str) { + std::string filename = path + dataset_str + ".csgr"; + graph_gpu.read(filename.c_str(), false); + return graph_gpu.nnodes; +} + +void Context::copy_data_to_device() { + CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); +} + +float_t * Context::get_in_ptr() { 
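The factor computed here, norm_factor[v] = 1/sqrt(deg(v)), is one half of GCN's symmetric normalization: when an edge (v, u) is aggregated, the two endpoint factors are typically multiplied so each neighbor contribution is scaled by 1/sqrt(deg(v) * deg(u)), i.e. the renormalized adjacency D^-1/2 A D^-1/2. A minimal CPU sketch of such an aggregation (the actual implementation is update_all in src/aggregator.cpp; edge_begin/edge_end are used elsewhere in this patch, while getEdgeDst and the loop shape are assumed from the usual Galois CSR graph interface):

void aggregate_normalized(size_t len, Graph& g, const float_t* in, float_t* out,
                          const float_t* norm_factor) {
  galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto v) {
    for (size_t j = 0; j < len; j++) out[v * len + j] = 0;
    for (auto e = g.edge_begin(v); e != g.edge_end(v); e++) {
      auto u = g.getEdgeDst(e);
      float_t w = norm_factor[v] * norm_factor[u];  // 1/sqrt(deg(v) * deg(u))
      for (size_t j = 0; j < len; j++)
        out[v * len + j] += w * in[u * len + j];
    }
  }, galois::loopname("AggregateNorm"));
}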
return d_feats; } + diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 7322e916d7..15e7009da6 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -14,7 +14,6 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di // TODO: need kernel fusion optimization // y[i] = e^x[i] / Σ e^x[k] void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { -//void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked @@ -27,7 +26,6 @@ void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *ou }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); } -//void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -40,6 +38,22 @@ void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } + +acc_t softmax_loss_layer::get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} + #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, loss, out_data); @@ -48,4 +62,8 @@ void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *ou void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, out_data, in_grad); } + +acc_t softmax_loss_layer::get_masked_loss() { + return masked_avg_loss(begin_, end_, count_, masks_, loss); +} #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 781e4a083a..98e91472aa 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,8 @@ #include "math_functions.hh" #include "context.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" void gpu_rng_uniform(const int n, unsigned *r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -214,3 +217,24 @@ void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); } +__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end-begin) { + if (masks[begin+i] == 1) + //total +=
loss[begin+i]; + total.reduce(loss[begin+i]); + } + total.thread_exit >(local_loss); +} + +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()); +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index df775c9504..1d81ea1012 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,10 +2,8 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); - //Context::create_blas_handle(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); - context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index 87795390ff..da79217231 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -11,5 +11,5 @@ void edge::merge_grads_gpu(float_t *dst) { } void edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } From 2f43132241a715afc2fa93192d83a29547b9d25a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 22:16:58 -0600 Subject: [PATCH 027/660] copy graph to gpu --- libdeepgalois/include/context.h | 26 +++++++++++++------------- libdeepgalois/include/cutils.h | 12 ++++++++++++ libdeepgalois/src/context.cpp | 2 +- libdeepgalois/src/context.cu | 20 +++++++++++++------- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 884cf51c1a..198b0cc9dc 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -31,18 +31,7 @@ class Context { label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } float_t * get_in_ptr(); - void degree_counting(); - void norm_factor_counting(); - std::vector labels; // labels for classification: N x 1 - float_t *norm_factor; // normalization constant based on graph structure - std::vector degrees; - vec_t h_feats; // input features: N x D - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - label_t *d_labels; // labels on device - float_t *d_feats; // input features on device - float_t *d_norm_factor; // norm_factor on device + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); size_t read_graph_gpu(std::string dataset_str); void copy_data_to_device(); // copy labels and input features @@ -50,6 +39,18 @@ class Context { void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + void norm_factor_counting(); + void norm_factor_counting_gpu(); + + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + std::vector labels; // labels for classification: N x 1 + label_t *d_labels; // labels on device + vec_t h_feats; // input features: N x D + float_t *d_feats; // input features on device + float_t *norm_factor; // normalization constant based on graph structure + float_t 
*d_norm_factor; // norm_factor on device #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N @@ -58,7 +59,6 @@ class Context { CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } - void norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor); #endif protected: diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 924dfd06e7..7d9eef3bb1 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,6 +13,18 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } +static unsigned CudaTest(const char *msg) { + cudaError_t e; + //cudaThreadSynchronize(); + cudaDeviceSynchronize(); + if (cudaSuccess != (e = cudaGetLastError())) { + fprintf(stderr, "%s: %d\n", msg, e); + fprintf(stderr, "%s\n", cudaGetErrorString(e)); + exit(-1); + } + return 0; +} + inline const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 50e954a19b..a500c02125 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -65,7 +65,7 @@ void Context::norm_factor_counting() { else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); #else - norm_factor_counting_gpu(n, graph_gpu, d_norm_factor); + norm_factor_counting_gpu(); #endif } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 626c5abc0a..182deeaed0 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -21,17 +21,20 @@ int64_t cluster_seedgen(void) { return seed; } -__global__ void norm_factor_counting_kernel(size_t n, CSRGraph graph, float_t *norm_factor) { +__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, float_t *norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_factor[i] = 0.0; - else norm_factor[i] = 1.0 / temp; + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; } } -void Context::norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor) { - CUDA_CHECK(cudaMalloc((void **)&norm_factor, n * sizeof(float_t))); - norm_factor_counting_kernel<<>>(n, graph, norm_factor); +void Context::norm_factor_counting_gpu() { + std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; + assert(graph_gpu.nnodes == n); + CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); + norm_factor_counting_kernel<<>>(n, graph_gpu, d_norm_factor); + CudaTest("solving norm_factor_counting kernel failed"); } cublasHandle_t Context::cublas_handle_ = 0; @@ -65,11 +68,14 @@ void Context::SetDevice(const int device_id) { size_t Context::read_graph_gpu(std::string dataset_str) { std::string filename = path + dataset_str + ".csgr"; - graph_gpu.read(filename.c_str(), false); + CSRGraph g; + g.read(filename.c_str(), false); + g.copy_to_gpu(graph_gpu); return graph_gpu.nnodes; } void Context::copy_data_to_device() { + assert(labels.size() == n); CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); From 04a4e3b13f0cff7383ffdfdfdc946e9a4f8203ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 
08:39:41 -0600 Subject: [PATCH 028/660] add optimizer.cu --- libdeepgalois/include/cutils.h | 2 +- libdeepgalois/include/layers/layer.h | 17 +-- libdeepgalois/include/math_functions.hh | 8 +- libdeepgalois/include/node.h | 1 - libdeepgalois/include/optimizer.h | 112 ++++-------------- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +- .../src/layers/softmax_loss_layer.cpp | 4 +- libdeepgalois/src/math_functions.cu | 8 +- libdeepgalois/src/node.cpp | 2 + libdeepgalois/src/optimizer.cpp | 76 ++++++++++++ libdeepgalois/src/optimizer.cu | 4 + 11 files changed, 133 insertions(+), 108 deletions(-) create mode 100644 libdeepgalois/src/optimizer.cpp create mode 100644 libdeepgalois/src/optimizer.cu diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 7d9eef3bb1..3710b50ec9 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,7 +13,7 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } -static unsigned CudaTest(const char *msg) { +inline unsigned CudaTest(const char *msg) { cudaError_t e; //cudaThreadSynchronize(); cudaDeviceSynchronize(); diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 83547a7f1f..c022b1be46 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -59,6 +59,9 @@ class layer : public node { end_ = sample_end; count_ = sample_count; masks_ = masks; +#ifndef CPU_ONLY + copy_masks_device(input_dims[0], masks_, d_masks_); +#endif } void set_in_data(float_t *data) { assert(data.size() == input_dims[0]*input_dims[1]); @@ -77,11 +80,9 @@ class layer : public node { next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); - //next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { // allocate memory for intermediate gradients - //next_->get_gradient().resize(output_dims[0]*output_dims[1]); } void forward() { forward_propagation(prev()->get_data(), next()->get_data()); @@ -90,15 +91,15 @@ class layer : public node { back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer *opt) { - // parallelize only when target size is big enough to mitigate thread spawning overhead. - bool parallel = (W.size() >= 512); //vec_t diff; //prev()->merge_grads(&diff); - //auto in_data = prev()->get_data(); - //float_t rcp_batch_size = float_t(1.0) / in_data.size(); - //for (size_t i = 0; i < diff.size(); ++i) - // diff[i] *= rcp_batch_size; +#ifdef CPU_ONLY + // parallelize only when target size is big enough to mitigate thread spawning overhead. 
+ bool parallel = (W.size() >= 512); opt->update(weight_grad, W, parallel); // W += grad +#else + opt->update_gpu(d_weight_grad, d_W); // W += grad +#endif //prev()->clear_grads(); next()->clear_grads(); } diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 810bb894b1..bf2dafbc5d 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -58,10 +58,7 @@ float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); -void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); void copy_gpu(size_t len, const float_t *in, float_t *out); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative @@ -77,4 +74,9 @@ void scal_gpu(const int N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); +void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); +void loss_malloc_device(int n, float_t *loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); + #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 98b97b2c55..5a3cf3f83f 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -29,7 +29,6 @@ class edge { public: edge(node *prev, size_t n, size_t len) : num_samples_(n), ft_dim_(len), - //data_(vec_t(n*len)), grad_(vec_t(n*len)), data_(NULL), grad_(NULL), prev_(prev) {} void alloc(); diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index d0f35eac11..2af75a4966 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -14,6 +14,7 @@ struct optimizer { optimizer &operator=(optimizer &&) = default; virtual ~optimizer() = default; virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; + virtual void update_gpu(const float_t *dW, float_t *W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -40,20 +41,8 @@ struct stateful_optimizer : public optimizer { **/ struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t 
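A caveat on the allocation helpers declared in math_functions.hh above (copy_masks_device, loss_malloc_device, and gconv_malloc_device): each takes the destination pointer by value, and in the .cu definitions cudaMalloc writes the new device address into that local copy, so the caller's d_masks_, loss, or weight buffers are left unchanged by the call as declared. Passing the pointer by reference (or as a double pointer) lets the allocation propagate back; a minimal sketch of the adjusted shape, assuming cutils.h for CUDA_CHECK (this is a possible fix, not what this commit declares):

#include <cassert>
#include "cutils.h"

// take the destination pointer by reference so the caller sees the new device address
void loss_malloc_device(int n, float_t*& loss) {
  CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t)));
}

void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) {
  assert(h_masks != NULL);
  CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t)));
  CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice));
}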
*dW, float_t *W) {} float_t alpha; // learning rate private: float_t eps; @@ -67,13 +56,8 @@ struct adagrad : public stateful_optimizer<1> { **/ struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t mu; // decay term private: @@ -83,23 +67,10 @@ struct RMSprop : public stateful_optimizer<1> { // Adam: A Method for Stochastic Optimization // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { - adam() : alpha(0.01), b1(float_t(0.9)), - b2(float_t(0.999)), b1_t(float_t(0.9)), - b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; - } + adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W); float_t alpha; // learning rate float_t b1; // decay term @@ -118,24 +89,11 @@ struct adam : public stateful_optimizer<2> { * */ struct adamax : public stateful_optimizer<2> { - adamax() - : alpha(float_t(0.002)), - b1(float_t(0.9)), - b2(float_t(0.999)), - b1_t(b1), - eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); - b1_t *= b1; - } + adamax() : alpha(float_t(0.002)), + b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(b1), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t b1; // decay term @@ -146,18 +104,12 @@ struct adamax : public stateful_optimizer<2> { float_t eps; // constant value to avoid zero-division }; -/** - * SGD without momentum - * - * slightly faster than tiny_dnn::momentum - **/ +// SGD without momentum +// slightly faster than tiny_dnn::momentum struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); - } + void update(const vec_t &dW, vec_t &W, bool 
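Among the optimizers, only adam declares update_gpu without an inline empty body, yet the src/optimizer.cu added by this commit still leaves that body empty. Since the CPU update() fully specifies the rule (exponential moving averages of the gradient and its square, then a bias-corrected step), the device version is essentially a transliteration; the kernel below is a sketch of it, where n, mt, and vt stand for the parameter count and device-resident moment buffers that the optimizer does not yet allocate:

__global__ void adam_update_kernel(int n, float_t alpha, float_t b1, float_t b2,
                                   float_t b1_t, float_t b2_t, float_t eps,
                                   const float_t* dW, float_t* mt, float_t* vt, float_t* W) {
  CUDA_KERNEL_LOOP(i, n) {
    mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i];          // first moment estimate
    vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i];  // second moment estimate
    W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) /
            sqrt((vt[i] / (float_t(1) - b2_t)) + eps);       // bias-corrected L2-norm update
  }
}

A wrapper implementing adam::update_gpu would launch this with CUDA_GET_BLOCKS(n) blocks of CUDA_NUM_THREADS threads and then advance b1_t and b2_t on the host, mirroring the end of the CPU update().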
parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -172,18 +124,8 @@ struct gradient_descent : public optimizer { struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - //}); - }, galois::loopname("momentum_update")); - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t lambda; // weight decay @@ -201,18 +143,8 @@ struct nesterov_momentum : public stateful_optimizer<1> { public: nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - //}); - }, galois::loopname("nesterov_momentum_update")); - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 8c7ba7fc1f..b81589b741 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -32,9 +32,9 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bia } void graph_conv_layer::init() { - std::cout << name_ << ": allocating memory for parameters and intermediate data... "; Timer t_alloc; t_alloc.Start(); + //std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters //rand_init_matrix(y, z, Q); @@ -47,7 +47,7 @@ void graph_conv_layer::init() { gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); #endif t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; + //std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY @@ -101,6 +101,9 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { assert(y <= 128); // currently only support feature length <= 128 + assert(in_data != NULL); + assert(in_temp != NULL); + assert(dropout_mask != NULL); if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 15e7009da6..430e1f253b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -4,11 +4,13 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); + std::cout << name_ << ": allocating memory for intermediate data... "; #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - out_malloc_device(in_dims[0], masks_, d_masks_, loss); + loss_malloc_device(in_dims[0], loss); #endif + std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 98e91472aa..34b426386a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -19,10 +19,14 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_ CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss) { +void loss_malloc_device(int n, float_t *loss) { + CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +} + +void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks) { + assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); } void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index e4cf43dd21..b08cf3c51c 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,6 +1,8 @@ #include "node.h" +#include void edge::alloc() { + std::cout << "Allocating memory for tensors (intermediate features and gradients... 
"; #ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp new file mode 100644 index 0000000000..3372378de1 --- /dev/null +++ b/libdeepgalois/src/optimizer.cpp @@ -0,0 +1,76 @@ +#include "optimizer.h" +#include "galois/Galois.h" + +void adagrad::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } +} + +void RMSprop::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, galois::loopname("rms_update")); +} + +void adam::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &vt = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; +} + +void adamax::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); + b1_t *= b1; +} + +void gradient_descent::update(const vec_t &dW, vec_t &W, bool parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); + }, galois::loopname("gradient_descent_update")); +} + +void momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, galois::loopname("momentum_update")); +} + +void nesterov_momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, galois::loopname("nesterov_momentum_update")); +} + diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu new file mode 100644 index 0000000000..832da51cbf --- /dev/null +++ b/libdeepgalois/src/optimizer.cu @@ -0,0 +1,4 @@ +#include "optimizer.h" + +void adam::update_gpu(const float_t *dW, float_t *W) { +} From 3fd5da6761d18ff3b639683997930ed3f97a1c5a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 08:44:40 -0600 Subject: [PATCH 029/660] fix bug --- libdeepgalois/include/optimizer.h | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 2af75a4966..072eb7d2bc 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -70,7 +70,11 @@ struct adam : public stateful_optimizer<2> { adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t &dW, vec_t &W, bool parallelize); +#ifdef CPU_ONLY + void update_gpu(const float_t *dW, float_t *W) {} +#else void update_gpu(const float_t *dW, float_t *W); +#endif float_t alpha; // learning rate float_t b1; // decay term From 63cfd0f876c31f0bee17825a0d641afaabeb8b8b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 08:51:26 -0600 Subject: [PATCH 030/660] update CMakeLists.txt --- libdeepgalois/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 625ab3b6a4..1ce41abc73 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -30,6 +30,8 @@ else() set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu + src/optimizer.cu + src/context.cu src/node.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) @@ -46,6 +48,7 @@ set(sources src/layers/softmax_loss_layer.cpp src/math_functions.cpp src/aggregator.cpp + src/optimizer.cpp src/context.cpp src/node.cpp src/net.cpp From 06964ac4e5f687c53d48d20009b9972a08a06861 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 11:58:04 -0600 Subject: [PATCH 031/660] fix gpu memory alloc --- libdeepgalois/include/layers/layer.h | 3 ++ libdeepgalois/include/math_functions.hh | 14 ++--- libdeepgalois/src/aggregator.cu | 2 + libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/math_functions.cpp | 30 +++++------ libdeepgalois/src/math_functions.cu | 52 +++++++++++++++++-- libdeepgalois/src/node.cpp | 2 +- 7 files changed, 77 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index c022b1be46..11f82b1486 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -85,12 +85,15 @@ class layer : public node { // allocate memory for intermediate gradients } void forward() { + std::cout << name_ << ": forwarding ... "; forward_propagation(prev()->get_data(), next()->get_data()); } void backward() { + std::cout << name_ << ": backwarding ... "; back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer *opt) { + std::cout << name_ << ": weight updating ... 
"; //vec_t diff; //prev()->merge_grads(&diff); #ifdef CPU_ONLY diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index bf2dafbc5d..2e435d60e2 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -44,11 +44,11 @@ void clear(size_t n, float_t *in); void relu(const vec_t &in, vec_t &out); // ReLU void relu(size_t n, const float_t *in, float_t *out); // ReLU void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); -void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff); +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out); // dropout +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out); +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); +void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff); void softmax(const vec_t &input, vec_t &output); void softmax(size_t n, const float_t *input, float_t *output); void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); @@ -62,8 +62,8 @@ void copy_gpu(size_t len, const float_t *in, float_t *out); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout -void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout +void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index f8d138ca76..a6b61ce914 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -26,8 +26,10 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph &g, const floa } void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + std::cout << "[debug]: update_all on GPU\n"; 
unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b81589b741..86f39ade20 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -119,7 +119,7 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout(y, scale_, in_grad, dropout_mask, in_grad); + if (dropout_) d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 7e0b805e05..6b41afb020 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -318,37 +318,37 @@ float reduce_mean(const vec_t &x) { return sum / (float)n; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out) { - assert(mask.size() == out.size()); - //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out) { + assert(masks.size() == out.size()); + //rng_bernoulli(1. - dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out) { +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out) { for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out) { +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { for (size_t i = 0; i < n; ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < n; ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } -void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { +void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff) { for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff) { +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff) { for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } float_t sigmoid_func(float_t x) { diff --git 
a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 34b426386a..415e141ec9 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -3,6 +3,8 @@ #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" +#include + void gpu_rng_uniform(const int n, unsigned *r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -45,15 +47,47 @@ void copy_gpu(size_t len, const float_t *in, float_t *out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } -__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, float_t* out) { +__global__ void setup_curand_kernel(const int n, curandState *state) { + CUDA_KERNEL_LOOP(i, n) { + curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + //curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + } +} + +__device__ bool bernoulli_gpu(int tid, curandState *state, float_t p) { + curandState local_state = state[tid]; + return curand_uniform(&local_state) <= p; +} + +__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { CUDA_KERNEL_LOOP(i, n) { - //masks[i] = bernoulli(dropout_rate); + masks[i] = bernoulli_gpu(i, state, dropout_rate); out[i] = in[i] * masks[i] * scale; } } void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - dropout_kernel<<>>(n, scale, dropout_rate, in, masks, out); + curandState *devStates; + CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); + std::cout << "[debug]: setup curand, n = " << n << "\n"; + setup_curand_kernel<<>>(n, devStates); + CudaTest("solving setup_curand kernel failed"); + std::cout << "[debug]: dropout_gpu\n"; + dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); + CudaTest("solving dropout kernel failed"); + CUDA_CHECK(cudaFree(devStates)); + std::cout << "[debug]: dropout_gpu done\n"; +} + +__global__ void d_dropout_kernel(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { + CUDA_KERNEL_LOOP(i, n) { + out[i] = in[i] * masks[i] * scale; + } +} + +void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { + d_dropout_kernel<<>>(n, scale, in, masks, out); + CudaTest("solving dropout kernel failed"); } // flattern data into 1D before feed into the ReLU operater @@ -64,7 +98,9 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } void relu_gpu(const int n, const float_t *in, float_t* out) { + std::cout << "[debug]: relu_gpu\n"; relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { @@ -75,6 +111,7 @@ __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); + CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -89,6 +126,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { + 
std::cout << "[debug]: matmul1D1D_gpu\n"; const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); @@ -134,6 +172,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); + CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { @@ -144,6 +183,7 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); + CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { @@ -154,6 +194,7 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, flo void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); + CudaTest("solving vadd kernel failed"); } // TODO: use warp @@ -206,12 +247,13 @@ __global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_d void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); } __global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; + float_t out_grad[41]; // TODO d_cross_entropy(len, labels[i], out+len*i, out_grad); d_softmax(len, out+len*i, out_grad, diff+len*i); } @@ -219,6 +261,7 @@ __global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy kernel failed"); } __global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { @@ -238,6 +281,7 @@ acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, flo *(total_loss.cpu_wr_ptr()) = 0; loss_accum.rv = total_loss.gpu_wr_ptr(); masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); cudaDeviceSynchronize(); return *(total_loss.cpu_rd_ptr()); } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index b08cf3c51c..5b60a9f22a 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -2,7 +2,7 @@ #include void edge::alloc() { - std::cout << "Allocating memory for tensors (intermediate features and gradients... 
"; + //std::cout << "Allocating memory for tensors (intermediate features and gradients) ...\n"; #ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; From e559ed12a69e1be1b93ea7b911cd6dac5b6429bd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 13:25:55 -0600 Subject: [PATCH 032/660] ran clang-format on lonestargnn --- lonestargnn/gcn/gcn.cpp | 75 +++++++++++--------- lonestargnn/graphsage/gs-mean.cpp | 56 ++++++++------- lonestargnn/lonestargnn.h | 114 ++++++++++++++++++++---------- 3 files changed, 147 insertions(+), 98 deletions(-) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 1ef0fa24f2..9bfe231181 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -7,40 +7,47 @@ const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarGnnStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(dataset, epochs, hidden1); - network.construct_layers(); // default setting for now; can be customized by the user - network.print_layers_info(); - ResourceManager rm; + galois::SharedMemSys G; + LonestarGnnStart(argc, argv, name, desc, url); + Net network; // the neural network to train + network.init(dataset, epochs, hidden1); + network.construct_layers(); // default setting for now; can be customized by + // the user + network.print_layers_info(); + ResourceManager rm; - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("TrainAndVal"); - Ttrain.start(); - network.train(opt, do_validate); // do training using training samples - Ttrain.stop(); + // the optimizer used to update parameters, see optimizer.h for more details + // optimizer *opt = new gradient_descent(); + // optimizer *opt = new adagrad(); + optimizer* opt = new adam(); + galois::StatTimer Ttrain("TrainAndVal"); + Ttrain.start(); + network.train(opt, do_validate); // do training using training samples + Ttrain.stop(); - if (do_test) { - // test using test samples - size_t n = network.get_nnodes(); - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 0, test_end = n, test_count = n; - std::vector test_mask(n, 0); - if (dataset == "reddit") { - test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; - for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; - } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); - } - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; + if (do_test) { + // test using test samples + size_t n = network.get_nnodes(); + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 0, test_end = n, test_count = n; + std::vector test_mask(n, 0); + if (dataset == "reddit") { + test_begin = 177262; + test_count = 55703; + test_end = test_begin + test_count; + for (size_t i = test_begin; i < test_end; i++) + test_mask[i] = 1; + } else + test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); + 
galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = network.evaluate(test_begin, test_end, test_count, + &test_mask[0], test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss + << " test_acc = " << test_acc << " test_time = " << test_time + << "\n"; + Ttest.stop(); + } + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; } - diff --git a/lonestargnn/graphsage/gs-mean.cpp b/lonestargnn/graphsage/gs-mean.cpp index b70cdc183c..4bd80e6203 100644 --- a/lonestargnn/graphsage/gs-mean.cpp +++ b/lonestargnn/graphsage/gs-mean.cpp @@ -6,36 +6,40 @@ const char* name = "GraphSage"; const char* desc = "A graph neural network variant: GraphSAGE"; const char* url = 0; -class GraphSageMean: public graph_conv_layer { - // user-defined combine function +class GraphSageMean : public graph_conv_layer { + // user-defined combine function }; int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); // default setting for now; see its implementation to find how to customize it by the user - ResourceManager rm; + galois::SharedMemSys G; + LonestarStart(argc, argv, name, desc, url); + Net network; // the neural network to train + network.init(); // default setting for now; see its implementation to find how + // to customize it by the user + ResourceManager rm; - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("Train"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); + // the optimizer used to update parameters, see optimizer.h for more details + // optimizer *opt = new gradient_descent(); + // optimizer *opt = new adagrad(); + optimizer* opt = new adam(); + galois::StatTimer Ttrain("Train"); + Ttrain.start(); + network.train(opt); // do training using training samples + Ttrain.stop(); - // test using test samples - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 TODO: replace ad-hoc settings - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); + // test using test samples + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 + // TODO: replace ad-hoc settings + galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = + network.evaluate(test_begin, test_end, test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss + << " test_acc = " << test_acc << " test_time = " << test_time + << "\n"; + Ttest.stop(); - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; } - diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index cbf3c1ae2a..7ecbe32d7a 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -12,17 +12,41 @@ #include namespace cll = llvm::cl; -static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the 
input graph -static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); -static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); -static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); -static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); -static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt + dataset(cll::Positional, cll::desc(""), + cll::Required); // 'cora', 'citeseer', 'pubmed' +static cll::opt + filetype(cll::Positional, cll::desc(""), + cll::init("gr")); // file format of the input graph +static cll::opt + model("m", cll::desc("Model string"), + cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt + learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), + cll::init(0.01)); +static cll::opt + epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), + cll::init(1)); +static cll::opt + hidden1("h", + cll::desc("Number of units in hidden layer 1 (default value 16)"), + cll::init(16)); +static cll::opt dropout_rate( + "d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), + cll::init(0.5)); +static cll::opt weight_decay( + "wd", + cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), + cll::init(5e-4)); +static cll::opt early_stopping( + "es", + cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), + cll::init(10)); +static cll::opt max_degree( + "md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), + cll::init(3)); +static cll::opt do_validate("dv", cll::desc("enable validation"), + cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); //! standard global options to the benchmarks @@ -31,40 +55,54 @@ extern llvm::cl::opt numThreads; extern llvm::cl::opt statFile; //! 
standard global options to the benchmarks -llvm::cl::opt skipVerify("noverify", llvm::cl::desc("Skip verification step (default value false)"), llvm::cl::init(false)); -llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default value 1)"), llvm::cl::init(1)); -llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); +llvm::cl::opt + skipVerify("noverify", + llvm::cl::desc("Skip verification step (default value false)"), + llvm::cl::init(false)); +llvm::cl::opt + numThreads("t", llvm::cl::desc("Number of threads (default value 1)"), + llvm::cl::init(1)); +llvm::cl::opt statFile( + "statFile", + llvm::cl::desc("ouput file to print stats to (default value empty)"), + llvm::cl::init("")); static void LonestarGnnPrintVersion() { - std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; + std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; } //! initialize lonestargnn benchmark -void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, const char* url) { - llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); - llvm::cl::ParseCommandLineOptions(argc, argv); - numThreads = galois::setActiveThreads(numThreads); - galois::runtime::setStatFile(statFile); - LonestarGnnPrintVersion(); - std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; - std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; - std::cout << "application: " << (app ? app : "unspecified") << "\n"; - if (desc) std::cout << desc << "\n"; - if (url) std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url << "\n"; - std::cout << "\n"; - std::ostringstream cmdout; - for (int i = 0; i < argc; ++i) { - cmdout << argv[i]; - if (i != argc - 1) cmdout << " "; - } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); - galois::runtime::reportParam("(NULL)", "Threads", numThreads); - char name[256]; - gethostname(name, 256); - galois::runtime::reportParam("(NULL)", "Hostname", name); +void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, + const char* url) { + llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + numThreads = galois::setActiveThreads(numThreads); + galois::runtime::setStatFile(statFile); + LonestarGnnPrintVersion(); + std::cout << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? 
app : "unspecified") << "\n"; + if (desc) + std::cout << desc << "\n"; + if (url) + std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" + << url << "\n"; + std::cout << "\n"; + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("(NULL)", "Hostname", name); } #include "types.h" #include "utils.h" #include "net.h" - From 172e69316f85c63e109674a3a37d2d36f23a363c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 13:32:38 -0600 Subject: [PATCH 033/660] clangformat on libdeepgalois --- libdeepgalois/include/aggregator.h | 9 +- libdeepgalois/include/context.h | 116 +-- libdeepgalois/include/cutils.h | 71 +- libdeepgalois/include/gtypes.h | 7 +- .../include/layers/arithmetic_layer.h | 34 +- .../include/layers/graph_conv_layer.h | 127 +-- libdeepgalois/include/layers/layer.h | 235 +++--- libdeepgalois/include/layers/linear_layer.h | 44 +- libdeepgalois/include/layers/relu_layer.h | 24 +- .../include/layers/softmax_loss_layer.h | 18 +- libdeepgalois/include/lgraph.h | 305 +++---- libdeepgalois/include/math_functions.hh | 150 ++-- libdeepgalois/include/net.h | 194 ++--- libdeepgalois/include/node.h | 77 +- libdeepgalois/include/optimizer.h | 238 ++++-- libdeepgalois/include/random.h | 81 +- libdeepgalois/include/timer.h | 33 +- libdeepgalois/include/types.h | 14 +- libdeepgalois/include/utils.h | 164 ++-- libdeepgalois/src/aggregator.cpp | 41 +- libdeepgalois/src/aggregator.cu | 50 +- libdeepgalois/src/context.cpp | 304 ++++--- libdeepgalois/src/context.cu | 125 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 228 +++--- libdeepgalois/src/layers/relu_layer.cpp | 46 +- .../src/layers/softmax_loss_layer.cpp | 109 +-- libdeepgalois/src/math_functions.cpp | 762 +++++++++--------- libdeepgalois/src/math_functions.cu | 395 +++++---- libdeepgalois/src/net.cpp | 161 ++-- libdeepgalois/src/node.cpp | 39 +- libdeepgalois/src/node.cu | 13 +- libdeepgalois/src/optimizer.cpp | 133 +-- libdeepgalois/src/optimizer.cu | 3 +- 33 files changed, 2372 insertions(+), 1978 deletions(-) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 78749104cf..01b1a1e8c8 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,9 +2,12 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +void update_all(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); #else #include "graph_gpu.h" -void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +#define TB_SIZE 256 +#define WARP_SIZE 32 +void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); #endif - diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 198b0cc9dc..688ed9a2a5 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -3,76 +3,82 @@ #include #include "types.h" #include "utils.h" -#ifdef CPU_ONLY #include "lgraph.h" +#ifdef CPU_ONLY #include "gtypes.h" #else #include "graph_gpu.h" -#include "cutils.h" #endif +#include "cutils.h" class Context { 
public: - Context(); - ~Context(); - enum Brew { CPU, GPU }; - Brew mode() { return mode_; } - void set_mode(Brew mode) { mode_ = mode; } - int solver_count() { return solver_count_; } - void set_solver_count(int val) { solver_count_ = val; } - int solver_rank() { return solver_rank_; } - void set_solver_rank(int val) { solver_rank_ = val; } - bool multiprocess() { return multiprocess_; } - void set_multiprocess(bool val) { multiprocess_ = val; } - bool root_solver() { return solver_rank_ == 0; } - size_t read_graph(std::string dataset_str); - size_t read_labels(std::string dataset_str); - size_t read_features(std::string dataset_str); - label_t get_label(size_t i) { return labels[i]; } - label_t *get_labels_ptr(size_t i) { return &(labels[0]); } - float_t * get_in_ptr(); - - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); - size_t read_graph_gpu(std::string dataset_str); - void copy_data_to_device(); // copy labels and input features - void SetDevice(const int device_id); - void DeviceQuery() {} - bool CheckDevice(const int device_id) { return true; } - int FindDevice(const int start_id = 0) { return 0; } - void norm_factor_counting(); - void norm_factor_counting_gpu(); - - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - std::vector labels; // labels for classification: N x 1 - label_t *d_labels; // labels on device - vec_t h_feats; // input features: N x D - float_t *d_feats; // input features on device - float_t *norm_factor; // normalization constant based on graph structure - float_t *d_norm_factor; // norm_factor on device - + Context(); + ~Context(); + enum Brew { CPU, GPU }; + // static Context& Get(); +#ifndef CPU_ONLY + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } + // static void create_blas_handle(); +#endif + Brew mode() { return mode_; } + void set_mode(Brew mode) { mode_ = mode; } + int solver_count() { return solver_count_; } + void set_solver_count(int val) { solver_count_ = val; } + int solver_rank() { return solver_rank_; } + void set_solver_rank(int val) { solver_rank_ = val; } + bool multiprocess() { return multiprocess_; } + void set_multiprocess(bool val) { multiprocess_ = val; } + bool root_solver() { return solver_rank_ == 0; } + size_t read_graph(std::string dataset_str); + size_t read_labels(std::string dataset_str); + size_t read_features(std::string dataset_str); + label_t get_label(size_t i) { return labels[i]; } + label_t* get_labels_ptr(size_t i) { return &(labels[0]); } + float_t* get_in_ptr(); + void degree_counting(); + void norm_factor_counting(); + std::vector labels; // labels for classification: N x 1 + float_t* norm_factor; // normalization constant based on graph structure + std::vector degrees; + vec_t h_feats; // input features: N x D + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D #ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N - void genGraph(LGraph &lg, Graph &g); + Graph graph_cpu; // the input graph, |V| = N + void genGraph(LGraph& lg, Graph& g); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); #else - CSRGraph graph_gpu; // the input graph, |V| = N - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; 
} + CSRGraph graph_gpu; // the input graph, |V| = N + label_t* d_labels; // labels on device + float_t* d_norm_factor; // norm_factor on device + float_t* d_feats; // input features on device + size_t read_graph_gpu(std::string dataset_str); + void copy_data_to_device(); // copy labels and input features + void SetDevice(const int device_id); + void DeviceQuery() {} + bool CheckDevice(const int device_id) { return true; } + int FindDevice(const int start_id = 0) { return 0; } #endif protected: #ifndef CPU_ONLY - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU #endif - Brew mode_; - int solver_count_; - int solver_rank_; - bool multiprocess_; + Brew mode_; + // shared_ptr random_generator_; + // Parallel training + int solver_count_; + int solver_rank_; + bool multiprocess_; private: - // The private constructor to avoid duplicate instantiation. - //Context(); + // The private constructor to avoid duplicate instantiation. + // Context(); }; - diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 3710b50ec9..830a4bbd08 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -10,19 +10,7 @@ const int CUDA_NUM_THREADS = 256; // CUDA: number of blocks for threads. inline int CUDA_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -inline unsigned CudaTest(const char *msg) { - cudaError_t e; - //cudaThreadSynchronize(); - cudaDeviceSynchronize(); - if (cudaSuccess != (e = cudaGetLastError())) { - fprintf(stderr, "%s: %d\n", msg, e); - fprintf(stderr, "%s\n", cudaGetErrorString(e)); - exit(-1); - } - return 0; + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } inline const char* cublasGetErrorString(cublasStatus_t error) { @@ -88,41 +76,42 @@ inline const char* curandGetErrorString(curandStatus_t error) { } // CUDA: various checks for different function calls. 
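// Usage sketch for the check wrappers below (illustrative calls): every CUDA /
// cuRAND call is wrapped so that a non-success status aborts with the file and
// line of the failing call, e.g.
//   float_t* d_buf;
//   CUDA_CHECK(cudaMalloc((void**)&d_buf, n * sizeof(float_t)));
//   CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), d_buf, n));
//   CUDA_CHECK(cudaFree(d_buf));
// CUBLAS_CHECK wraps cublasStatus_t-returning cuBLAS calls in the same way.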
-#define CUDA_CHECK(condition) \ - do { \ - cudaError_t error = condition; \ - if (error != cudaSuccess) { \ - fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ - error, __FILE__, __LINE__, cudaGetErrorString(error) ); \ - exit(EXIT_FAILURE); \ - } \ +#define CUDA_CHECK(condition) \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ + error, __FILE__, __LINE__, cudaGetErrorString(error)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - if (status != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ - status, __FILE__, __LINE__, cublasGetErrorString(status) ); \ - exit(EXIT_FAILURE); \ - } \ +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cublasGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - if (status != CURAND_STATUS_SUCCESS) { \ - fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ - status, __FILE__, __LINE__, curandGetErrorString(status) ); \ - exit(EXIT_FAILURE); \ - } \ +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, curandGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) // CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); i += blockDim.x * gridDim.x) +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) // CUDA: check for error after kernel execution and exit loudly if there is one. #define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) - diff --git a/libdeepgalois/include/gtypes.h b/libdeepgalois/include/gtypes.h index a30468b0f9..e11c1058cc 100644 --- a/libdeepgalois/include/gtypes.h +++ b/libdeepgalois/include/gtypes.h @@ -6,10 +6,11 @@ typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; #ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< + true>::type ::with_no_lockable::type Graph; #else -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< + true>::type ::with_no_lockable::type Graph; #endif typedef Graph::GraphNode GNode; - diff --git a/libdeepgalois/include/layers/arithmetic_layer.h b/libdeepgalois/include/layers/arithmetic_layer.h index aed91e0379..63dc66f780 100644 --- a/libdeepgalois/include/layers/arithmetic_layer.h +++ b/libdeepgalois/include/layers/arithmetic_layer.h @@ -4,19 +4,23 @@ // element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` class elementwise_add_layer : public layer { public: - elementwise_add_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : layer(level, in_dim, out_dim) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("elementwise_add"); } - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < in_data.size(); ++sample) { - for (size_t j = 0; j < in_data[0].size(); j++) - out_data[sample][j] = in_data[sample][j]; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - in_grad = out_grad; - } + elementwise_add_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : layer(level, in_dim, out_dim) { + trainable_ = false; + } + std::string layer_type() const override { + return std::string("elementwise_add"); + } + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < in_data.size(); ++sample) { + for (size_t j = 0; j < in_data[0].size(); j++) + out_data[sample][j] = in_data[sample][j]; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + in_grad = out_grad; + } }; diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index f0f27687e7..7dfc8c2154 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -3,72 +3,79 @@ #include "aggregator.h" /* GraphConv Layer - Parameters - ---------- - x: int, number of samples. - y: int, Input feature size. - z: int, Output feature size. - dropout: bool, optional, if True, a dropout operation is applied before other operations. - norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. Default: ``True``. - bias : bool, optional, if True, adds a learnable bias to the output. Default: ``False``. - activation: callable activation function/layer or None, optional - If not None, applies an activation function to the updated node features. Default: ``None``. + Parameters + ---------- + x: int, number of samples. + y: int, Input feature size. + z: int, Output feature size. + dropout: bool, optional, if True, a dropout operation is applied before + other operations. norm : bool, optional, if True, the normalizer + :math:`c_{ij}` is applied. Default: ``True``. bias : bool, optional, if True, + adds a learnable bias to the output. Default: ``False``. activation: callable + activation function/layer or None, optional If not None, applies an + activation function to the updated node features. Default: ``None``. 
*/ -class graph_conv_layer: public layer { +class graph_conv_layer : public layer { public: - graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, std::vector out_dims); - graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : - graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} - ~graph_conv_layer() {} - void init(); - std::string layer_type() const override { return std::string("graph_conv"); } - void set_netphase(net_phase ctx) override { phase_ = ctx; } - //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); - //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); - virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); - // user-defined aggregate function + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, + float dropout_rate, std::vector in_dims, + std::vector out_dims); + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, + out_dims) {} + ~graph_conv_layer() {} + void init(); + std::string layer_type() const override { return std::string("graph_conv"); } + void set_netphase(net_phase ctx) override { phase_ = ctx; } + // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); + // virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, + // vec_t &out_grad, vec_t &in_grad); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(size_t len, Graph &g, const float_t *in, float_t *out); + virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out); + virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, + float_t* out); #endif - // user-defined combine function - virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); + // user-defined combine function + virtual void combine(const vec_t& self, const vec_t& neighbors, vec_t& out); private: - bool act_; // whether to use activation function at the end - bool norm_; // whether to normalize data - bool bias_; // whether to add bias afterwards - bool dropout_; // whether to use dropout at first - const float dropout_rate_; - float scale_; - net_phase phase_; - size_t x; - size_t y; - size_t z; - float_t *out_temp; - float_t *in_temp; - float_t *trans_data; // y*x - unsigned * dropout_mask; // x*y + bool act_; // whether to use activation function at the end + bool norm_; // whether to normalize data + bool bias_; // whether to add bias afterwards + bool dropout_; // whether to use dropout at first + const float dropout_rate_; + float scale_; + net_phase phase_; + size_t x; + size_t y; + size_t z; + float_t* out_temp; + float_t* in_temp; + float_t* trans_data; // y*x + unsigned* dropout_mask; // x*y - // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - auto init_range = sqrt(6.0/(dim_x + dim_y)); - std::default_random_engine rng; - 
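    // Xavier/Glorot uniform initialization (the AISTATS 2010 scheme cited
    // above): with fan_in = dim_x and fan_out = dim_y, each weight is drawn
    // from U(-r, r) where r = sqrt(6 / (fan_in + fan_out)), which keeps the
    // variance of activations and gradients roughly constant across layers.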
std::uniform_real_distribution dist(-init_range, init_range); - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = dist(rng); - } - } - inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = 0; - } - } + // Glorot & Bengio (AISTATS 2010) + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + auto init_range = sqrt(6.0 / (dim_x + dim_y)); + std::default_random_engine rng; + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = dist(rng); + } + } + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = 0; + } + } }; diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 11f82b1486..c0e694d21c 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -33,116 +33,139 @@ class layer : public node { public: - layer(unsigned level, std::vector in_dims, std::vector out_dims) : - node(in_dims.size(), out_dims.size()), - level_(level), begin_(0), end_(0), num_dims(in_dims.size()), - input_dims(in_dims), output_dims(out_dims) { add_edge(); } - virtual ~layer() = default; - virtual std::string layer_type() const = 0; - virtual void set_netphase(net_phase phase) {} - virtual void set_context(Context *ctx) { context = ctx; } - virtual acc_t get_masked_loss() { return acc_t(0); } - virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; + layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), + end_(0), num_dims(in_dims.size()), input_dims(in_dims), + output_dims(out_dims) { + add_edge(); + } + virtual ~layer() = default; + virtual std::string layer_type() const = 0; + virtual void set_netphase(net_phase phase) {} + virtual void set_context(Context* ctx) { context = ctx; } + // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = + // 0; virtual void back_propagation(const vec_t &in_data, const vec_t + // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual void forward_propagation(const float_t* in_data, + float_t* out_data) = 0; + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) = 0; - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() - << " input[" << input_dims[0] << "," << input_dims[1] - << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; -#ifndef CPU_ONLY - copy_masks_device(input_dims[0], masks_, d_masks_); 
-#endif - } - void set_in_data(float_t *data) { - assert(data.size() == input_dims[0]*input_dims[1]); - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - // - // allocate memory for intermediate features - //prev_->get_data() = data; - //std::copy(data.begin(), data.end(), prev_->get_data()); - // allocate memory for intermediate gradients - //prev_->get_gradient().resize(input_dims[0]*input_dims[1]); - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors and gradients - next_->alloc(); - } - void alloc_grad() { - // allocate memory for intermediate gradients - } - void forward() { - std::cout << name_ << ": forwarding ... "; - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - std::cout << name_ << ": backwarding ... "; - back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer *opt) { - std::cout << name_ << ": weight updating ... "; - //vec_t diff; - //prev()->merge_grads(&diff); -#ifdef CPU_ONLY - // parallelize only when target size is big enough to mitigate thread spawning overhead. - bool parallel = (W.size() >= 512); - opt->update(weight_grad, W, parallel); // W += grad -#else - opt->update_gpu(d_weight_grad, d_W); // W += grad -#endif - //prev()->clear_grads(); - next()->clear_grads(); - } + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" + << input_dims[0] << "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, + size_t sample_count, mask_t* masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; + } + void set_in_data(float_t* data) { + assert(data.size() == input_dims[0] * input_dims[1]); + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + // + // allocate memory for intermediate features + // prev_->get_data() = data; + // std::copy(data.begin(), data.end(), prev_->get_data()); + // allocate memory for intermediate gradients + // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + // next_->get_data().resize(output_dims[0]*output_dims[1]); + } + void alloc_grad() { + // allocate memory for intermediate gradients + // next_->get_gradient().resize(output_dims[0]*output_dims[1]); + } + void forward() { + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + back_propagation(prev()->get_data(), next()->get_data(), + next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer* opt) { + // parallelize only when target size is big enough to mitigate thread + // spawning overhead. 
+ bool parallel = (W.size() >= 512); + // vec_t diff; + // prev()->merge_grads(&diff); + // auto in_data = prev()->get_data(); + // float_t rcp_batch_size = float_t(1.0) / in_data.size(); + // for (size_t i = 0; i < diff.size(); ++i) + // diff[i] *= rcp_batch_size; + opt->update(weight_grad, W, parallel); // W += grad + // prev()->clear_grads(); + next()->clear_grads(); + } + inline acc_t get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; + } protected: - unsigned level_; // layer id: [0, num_layers-1] - size_t begin_; // sample begin index - size_t end_; // sample end index - size_t count_; // number of samples - size_t num_dims; // number of dimensions - std::vector input_dims; // input dimensions - std::vector output_dims; // output dimentions - std::string name_; // name of this layer - bool trainable_; // is this layer trainable - vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E - vec_t weight_grad; // weight gradient for updating parameters - float_t *d_W; - float_t *d_weight_grad; - mask_t *masks_; // masks to show which samples are valid - mask_t *d_masks_; - float_t *loss; // error for each vertex: N x 1 - Context *context; + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + size_t num_dims; // number of dimensions + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x + // 16, layer1: 16 x E + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; + float_t* d_weight_grad; + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; + float_t* loss; // error for each vertex: N x 1 + Context* context; }; // head: layer i+1, tail: layer i -inline void connect(layer *head, layer *tail, - size_t head_index = 0, size_t tail_index = 0) { - //auto out_shape = head->out_shape()[head_index]; - //auto in_shape = tail->in_shape()[tail_index]; - //head->setup(false); - //if (in_shape.size() == 0) { - // tail->set_in_shape(out_shape); - // in_shape = out_shape; - //} - //if (out_shape.size() != in_shape.size()) - // connection_mismatch(*head, *tail); - //if (!head->next_[head_index]) - // throw nn_error("output edge must not be null"); - tail->prev_ = head->next_; - tail->prev_->add_next_node(tail); +inline void connect(layer* head, layer* tail, size_t head_index = 0, + size_t tail_index = 0) { + // auto out_shape = head->out_shape()[head_index]; + // auto in_shape = tail->in_shape()[tail_index]; + // head->setup(false); + // if (in_shape.size() == 0) { + // tail->set_in_shape(out_shape); + // in_shape = out_shape; + //} + // if (out_shape.size() != in_shape.size()) + // connection_mismatch(*head, *tail); + // if (!head->next_[head_index]) + // throw nn_error("output edge must not be null"); + tail->prev_ = head->next_; + tail->prev_->add_next_node(tail); } - diff --git a/libdeepgalois/include/layers/linear_layer.h b/libdeepgalois/include/layers/linear_layer.h index e4ff524f3f..55d5d245d8 100644 --- a/libdeepgalois/include/layers/linear_layer.h +++ b/libdeepgalois/include/layers/linear_layer.h @@ -3,26 +3,30 @@ class linear_layer : public layer { public: - linear_layer(unsigned level, float_t scale, float_t bias, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { - trainable_ = false; } - linear_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : linear_layer(level, 1.0, 0.0, in_dim, out_dim) { } - std::string layer_type() const override { return "linear"; } + linear_layer(unsigned level, float_t scale, float_t bias, + std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { + trainable_ = false; + } + linear_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : linear_layer(level, 1.0, 0.0, in_dim, out_dim) {} + std::string layer_type() const override { return "linear"; } + + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) { + for (size_t i = 0; i < input_dims[1]; i++) + out_data[sample][i] = scale_ * in_data[sample][i] + bias_; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) + for (size_t i = 0; i < input_dims[1]; i++) + in_grad[sample][i] = out_grad[sample][i] * scale_; + } - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) { - for (size_t i = 0; i < input_dims[1]; i ++) - out_data[sample][i] = scale_ * in_data[sample][i] + bias_; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) - for 
(size_t i = 0; i < input_dims[1]; i++) - in_grad[sample][i] = out_grad[sample][i] * scale_; - } protected: - float_t scale_, bias_; + float_t scale_, bias_; }; diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/layers/relu_layer.h index 285e09b472..8a7b447038 100644 --- a/libdeepgalois/include/layers/relu_layer.h +++ b/libdeepgalois/include/layers/relu_layer.h @@ -4,14 +4,18 @@ // ReLU Layer class relu_layer : public layer { public: - relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } - ~relu_layer() {} - std::string layer_type() const override { return std::string("relu"); } - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); - virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); + relu_layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + ~relu_layer() {} + std::string layer_type() const override { return std::string("relu"); } + virtual void forward_propagation(const tensor_t& in_data, tensor_t& out_data); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const tensor_t& in_data, + const tensor_t& out_data, tensor_t& out_grad, + tensor_t& in_grad); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); }; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 78166b2fb5..0a680a3209 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -1,13 +1,15 @@ #pragma once #include "layer.h" -class softmax_loss_layer: public layer { +class softmax_loss_layer : public layer { public: - softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); - ~softmax_loss_layer() {} - std::string layer_type() const override { return std::string("softmax_loss"); } - virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); - virtual acc_t get_masked_loss(); + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~softmax_loss_layer() {} + std::string layer_type() const override { + return std::string("softmax_loss"); + } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); }; - diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index 78f6f76aec..65cd004c82 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -1,7 +1,7 @@ #ifndef __LGRAPH_HPP__ #define __LGRAPH_HPP__ -//defines the Learning Graph (LGraph) data structure +// defines the Learning Graph (LGraph) data structure #include #include #include @@ -12,168 +12,173 @@ typedef unsigned IndexT; typedef float ValueT; struct Edge { - IndexT src; - IndexT dst; - ValueT elabel; - Edge() : src(0), dst(0), elabel(0) {} - Edge(IndexT from, IndexT to, 
ValueT el) : - src(from), dst(to), elabel(el) {} - std::string to_string() const { - std::stringstream ss; - ss << "e(" << src << "," << dst << "," << elabel << ")"; - return ss.str(); - } + IndexT src; + IndexT dst; + ValueT elabel; + Edge() : src(0), dst(0), elabel(0) {} + Edge(IndexT from, IndexT to, ValueT el) : src(from), dst(to), elabel(el) {} + std::string to_string() const { + std::stringstream ss; + ss << "e(" << src << "," << dst << "," << elabel << ")"; + return ss.str(); + } }; typedef std::vector EdgeList; class LGraph { public: - LGraph() : symmetrize_(false), directed_(false) {} - void clean() { - delete[] rowptr_; - delete[] colidx_; - delete[] weight_; - degrees.clear(); - el.clear(); - //labels_.clear(); - //vertices.clear(); - } - bool directed() const { return directed_; } - size_t num_vertices() const { return num_vertices_; } - size_t num_edges() const { return num_edges_; } - IndexT * out_rowptr() const { return rowptr_; } - IndexT * out_colidx() const { return colidx_; } - unsigned out_degree(IndexT n) const { return rowptr_[n+1] - rowptr_[n]; } - IndexT get_offset(IndexT n) { return rowptr_[n]; } - IndexT get_dest(IndexT n) { return colidx_[n]; } - ValueT get_weight(IndexT n) { return weight_[n]; } - unsigned get_max_degree() { return max_degree; } - //ValueT * labels() { return labels_.data(); } - //ValueT get_label(IndexT n) { return labels_[n]; } - void read_edgelist(const char *filename, bool symmetrize = false) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - IndexT max_vid = 0; - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - IndexT u, v; - edge_stream >> u; - edge_stream >> v; - el.push_back(Edge(u, v, 1)); - if (symmetrize) el.push_back(Edge(v, u, 1)); - if (u > max_vid) max_vid = u; - if (v > max_vid) max_vid = v; - } - in.close(); - directed_ = true; - num_vertices_ = max_vid+1; - num_edges_ = el.size(); - std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " << num_edges_ << "\n"; - MakeGraphFromEL(); - } + LGraph() : symmetrize_(false), directed_(false) {} + void clean() { + delete[] rowptr_; + delete[] colidx_; + delete[] weight_; + degrees.clear(); + el.clear(); + // labels_.clear(); + // vertices.clear(); + } + bool directed() const { return directed_; } + size_t num_vertices() const { return num_vertices_; } + size_t num_edges() const { return num_edges_; } + IndexT* out_rowptr() const { return rowptr_; } + IndexT* out_colidx() const { return colidx_; } + unsigned out_degree(IndexT n) const { return rowptr_[n + 1] - rowptr_[n]; } + IndexT get_offset(IndexT n) { return rowptr_[n]; } + IndexT get_dest(IndexT n) { return colidx_[n]; } + ValueT get_weight(IndexT n) { return weight_[n]; } + unsigned get_max_degree() { return max_degree; } + // ValueT * labels() { return labels_.data(); } + // ValueT get_label(IndexT n) { return labels_[n]; } + void read_edgelist(const char* filename, bool symmetrize = false) { + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + IndexT max_vid = 0; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + IndexT u, v; + edge_stream >> u; + edge_stream >> v; + el.push_back(Edge(u, v, 1)); + if (symmetrize) + el.push_back(Edge(v, u, 1)); + if (u > max_vid) + max_vid = u; + if (v > max_vid) + max_vid = v; + } + in.close(); + directed_ = true; + num_vertices_ = max_vid + 1; + num_edges_ = el.size(); + std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " + << num_edges_ << "\n"; + MakeGraphFromEL(); + } 
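  // Worked example of the CSR build below: read_edgelist(f, true) on a file
  // containing the two lines "0 1" and "1 2" stores
  //   el = {(0,1), (1,0), (1,2), (2,1)}   (num_vertices_ = 3, num_edges_ = 4);
  // SquishGraph() finds no self loops or duplicate edges, and MakeCSR() then
  // yields rowptr_ = [0, 1, 3, 4], colidx_ = [1, 0, 2, 1], max_degree = 2.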
private: - EdgeList el; - bool symmetrize_; // whether to symmetrize a directed graph - bool directed_; - size_t num_vertices_; - size_t num_edges_; - IndexT *rowptr_; - IndexT *colidx_; - ValueT *weight_; - unsigned max_degree; - std::vector degrees; - std::vector labels_; - std::vector > vertices; + EdgeList el; + bool symmetrize_; // whether to symmetrize a directed graph + bool directed_; + size_t num_vertices_; + size_t num_edges_; + IndexT* rowptr_; + IndexT* colidx_; + ValueT* weight_; + unsigned max_degree; + std::vector degrees; + std::vector labels_; + std::vector> vertices; - static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } + static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } - void MakeGraphFromEL() { - SquishGraph(); - MakeCSR(false); - } + void MakeGraphFromEL() { + SquishGraph(); + MakeCSR(false); + } - void SquishGraph(bool remove_selfloops = true, bool remove_redundents = true) { - std::vector neighbors; - for (size_t i = 0; i < num_vertices_; i++) - vertices.push_back(neighbors); - for (size_t i = 0; i < num_edges_; i ++) - vertices[el[i].src].push_back(el[i]); - el.clear(); - printf("Sorting the neighbor lists..."); - for (size_t i = 0; i < num_vertices_; i ++) - std::sort(vertices[i].begin(), vertices[i].end(), compare_id); - printf(" Done\n"); - //remove self loops - int num_selfloops = 0; - if(remove_selfloops) { - printf("Removing self loops..."); - for(size_t i = 0; i < num_vertices_; i ++) { - for(unsigned j = 0; j < vertices[i].size(); j ++) { - if(i == vertices[i][j].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_selfloops ++; - j --; - } - } - } - printf(" %d selfloops are removed\n", num_selfloops); - num_edges_ -= num_selfloops; - } - // remove redundent - int num_redundents = 0; - if(remove_redundents) { - printf("Removing redundent edges..."); - for (size_t i = 0; i < num_vertices_; i ++) { - for (unsigned j = 1; j < vertices[i].size(); j ++) { - if (vertices[i][j].dst == vertices[i][j-1].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_redundents ++; - j --; - } - } - } - printf(" %d redundent edges are removed\n", num_redundents); - num_edges_ -= num_redundents; - } - } + void SquishGraph(bool remove_selfloops = true, + bool remove_redundents = true) { + std::vector neighbors; + for (size_t i = 0; i < num_vertices_; i++) + vertices.push_back(neighbors); + for (size_t i = 0; i < num_edges_; i++) + vertices[el[i].src].push_back(el[i]); + el.clear(); + printf("Sorting the neighbor lists..."); + for (size_t i = 0; i < num_vertices_; i++) + std::sort(vertices[i].begin(), vertices[i].end(), compare_id); + printf(" Done\n"); + // remove self loops + int num_selfloops = 0; + if (remove_selfloops) { + printf("Removing self loops..."); + for (size_t i = 0; i < num_vertices_; i++) { + for (unsigned j = 0; j < vertices[i].size(); j++) { + if (i == vertices[i][j].dst) { + vertices[i].erase(vertices[i].begin() + j); + num_selfloops++; + j--; + } + } + } + printf(" %d selfloops are removed\n", num_selfloops); + num_edges_ -= num_selfloops; + } + // remove redundent + int num_redundents = 0; + if (remove_redundents) { + printf("Removing redundent edges..."); + for (size_t i = 0; i < num_vertices_; i++) { + for (unsigned j = 1; j < vertices[i].size(); j++) { + if (vertices[i][j].dst == vertices[i][j - 1].dst) { + vertices[i].erase(vertices[i].begin() + j); + num_redundents++; + j--; + } + } + } + printf(" %d redundent edges are removed\n", num_redundents); + num_edges_ -= num_redundents; + } + } - void MakeCSR(bool 
transpose) { - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i ++) - degrees[i] = vertices[i].size(); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); + void MakeCSR(bool transpose) { + degrees.resize(num_vertices_); + std::fill(degrees.begin(), degrees.end(), 0); + for (size_t i = 0; i < num_vertices_; i++) + degrees[i] = vertices[i].size(); + max_degree = *(std::max_element(degrees.begin(), degrees.end())); - std::vector offsets(degrees.size() + 1); - IndexT total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; + std::vector offsets(degrees.size() + 1); + IndexT total = 0; + for (size_t n = 0; n < degrees.size(); n++) { + offsets[n] = total; + total += degrees[n]; + } + offsets[degrees.size()] = total; - assert(num_edges_ == offsets[num_vertices_]); - weight_ = new ValueT[num_edges_]; - colidx_ = new IndexT[num_edges_]; - rowptr_ = new IndexT[num_vertices_+1]; - for (size_t i = 0; i < num_vertices_+1; i ++) rowptr_[i] = offsets[i]; - for (size_t i = 0; i < num_vertices_; i ++) { - for (auto it = vertices[i].begin(); it < vertices[i].end(); it ++) { - Edge e = *it; - assert(i == e.src); - if (symmetrize_ || (!symmetrize_ && !transpose)) { - weight_[offsets[e.src]] = e.elabel; - colidx_[offsets[e.src]++] = e.dst; - } - if (symmetrize_ || (!symmetrize_ && transpose)) { - weight_[offsets[e.dst]] = e.elabel; - colidx_[offsets[e.dst]++] = e.src; - } - } - } - } + assert(num_edges_ == offsets[num_vertices_]); + weight_ = new ValueT[num_edges_]; + colidx_ = new IndexT[num_edges_]; + rowptr_ = new IndexT[num_vertices_ + 1]; + for (size_t i = 0; i < num_vertices_ + 1; i++) + rowptr_[i] = offsets[i]; + for (size_t i = 0; i < num_vertices_; i++) { + for (auto it = vertices[i].begin(); it < vertices[i].end(); it++) { + Edge e = *it; + assert(i == e.src); + if (symmetrize_ || (!symmetrize_ && !transpose)) { + weight_[offsets[e.src]] = e.elabel; + colidx_[offsets[e.src]++] = e.dst; + } + if (symmetrize_ || (!symmetrize_ && transpose)) { + weight_[offsets[e.dst]] = e.elabel; + colidx_[offsets[e.dst]++] = e.src; + } + } + } + } }; #endif diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 2e435d60e2..6f4348ff34 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -13,70 +13,94 @@ extern "C" { const float negative_slope = 0; -void vadd(const vec_t &a, const vec_t &b, vec_t &out); // vector add -void vadd(size_t n, const float_t *a, const float_t *b, float_t *out); -void vsub(const vec_t &a, const vec_t &b, vec_t &out); -void vmul(const vec_t &a, const vec_t &b, vec_t &out); -void vdiv(const vec_t &a, const vec_t &b, vec_t &out); -void add_scalar(const float_t alpha, vec_t &Y); -void sub_scalar(const float_t alpha, vec_t &Y); -void mul_scalar(const float_t alpha, vec_t &Y); -void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out); -void div_scalar(const float_t alpha, vec_t &Y); -float_t dot(const vec_t &x, const vec_t &y); -void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector); -void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); -void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); -void copy2D1D(const tensor_t &in, vec_t &out); -void copy1D1D(const vec_t &in, vec_t &out); -void copy1D1D(size_t len, const float_t *in, float_t *out); -void 
matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply -void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); -void transpose2D(const tensor_t &in, tensor_t &out); -void transpose2D1D(const tensor_t &in, vec_t &out); -void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); -void transpose(size_t x, size_t y, const float_t *in, float_t *out); -int argmax(const size_t n, const vec_t &x); // the arguments of the maxima -int argmax(const size_t n, const float_t *x); // the arguments of the maxima -void clear(vec_t &in); -void clear(size_t n, float_t *in); -void relu(const vec_t &in, vec_t &out); // ReLU -void relu(size_t n, const float_t *in, float_t *out); // ReLU -void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out); -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); -void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff); -void softmax(const vec_t &input, vec_t &output); -void softmax(size_t n, const float_t *input, float_t *output); -void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); -void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp); -float_t cross_entropy(const vec_t &y, const vec_t &p); -float_t cross_entropy(size_t n, const float_t *y, const float_t *p); -void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); -void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); +void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add +void vadd(size_t n, const float_t* a, const float_t* b, float_t* out); +void vsub(const vec_t& a, const vec_t& b, vec_t& out); +void vmul(const vec_t& a, const vec_t& b, vec_t& out); +void vdiv(const vec_t& a, const vec_t& b, vec_t& out); +void add_scalar(const float_t alpha, vec_t& Y); +void sub_scalar(const float_t alpha, vec_t& Y); +void mul_scalar(const float_t alpha, vec_t& Y); +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +void div_scalar(const float_t alpha, vec_t& Y); +float_t dot(const vec_t& x, const vec_t& y); +void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); +void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); +void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, + tensor_t& C); +void copy2D1D(const tensor_t& in, vec_t& out); +void copy1D1D(const vec_t& in, vec_t& out); +void copy1D1D(size_t len, const float_t* in, float_t* out); +void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, + vec_t& C); +void transpose2D(const tensor_t& in, tensor_t& out); +void transpose2D1D(const tensor_t& in, vec_t& out); +void transpose(size_t x, size_t y, const vec_t& in, 
vec_t& out); +void transpose(size_t x, size_t y, const float_t* in, float_t* out); +int argmax(const size_t n, const vec_t& x); // the arguments of the maxima +int argmax(const size_t n, const float_t* x); // the arguments of the maxima +void clear(vec_t& in); +void clear(size_t n, float_t* in); +void relu(const vec_t& in, vec_t& out); // ReLU +void relu(size_t n, const float_t* in, float_t* out); // ReLU +void d_relu(const vec_t& in_diff, const vec_t& data, + vec_t& out_diff); // ReLU derivative +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, vec_t& out); // dropout +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, float_t* out); +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out); +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& mask, + vec_t& out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* mask, float_t* out_diff); +void softmax(const vec_t& input, vec_t& output); +void softmax(size_t n, const float_t* input, float_t* output); +void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); +float_t cross_entropy(const vec_t& y, const vec_t& p); +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); -void copy_gpu(size_t len, const float_t *in, float_t *out); -void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add -void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU -void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout -void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply -int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); -void scal_gpu(const int N, const float alpha, float *X); +void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned* masks, float_t* in, float_t* out, + float_t* matrix, float_t* grad); +void copy_gpu(size_t len, const float_t* in, float_t* out); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned* masks, + float_t* in, float_t* out); +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void 
relu_gpu(const int n, const float_t* in, float_t* out); // ReLU +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff); // ReLU derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out); // dropout +void d_dropout_gpu(const float scale, const float_t* in_diff, + const unsigned* mask, + float_t* out_diff); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +int argmax_gpu(const size_t n, const float_t* x); // the arguments of the maxima +void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); - -void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); -void loss_malloc_device(int n, float_t *loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index c2bf8e997e..87a0e3b72b 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -11,110 +11,124 @@ #define NUM_CONV_LAYERS 2 -// N: number of vertices, D: feature vector dimentions, +// N: number of vertices, D: feature vector dimentions, // E: number of distinct labels, i.e. 
number of vertex classes // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: - Net() {} - void init(std::string dataset_str, unsigned epochs, unsigned hidden1); - size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } - size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_nnodes() { return num_samples; } - void train(optimizer *opt, bool need_validate); // training - void construct_layers(); - void set_contexts() { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_context(context); - } - void set_netphases(net_phase phase) { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_netphase(phase); - } - void print_layers_info() { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->print_layer_info(); - } + Net() {} + void init(std::string dataset_str, unsigned epochs, unsigned hidden1); + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } + size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } + size_t get_nnodes() { return num_samples; } + void train(optimizer* opt, bool need_validate); // training + void construct_layers(); + void set_contexts() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_context(context); + } + void set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_netphase(phase); + } + void print_layers_info() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->print_layer_info(); + } - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, float dropout_rate = 0.5) { - assert(dropout_rate < 1.0); - assert(layer_id < NUM_CONV_LAYERS); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); - if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); - } + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true, + float dropout_rate = 0.5) { + assert(dropout_rate < 1.0); + assert(layer_id < NUM_CONV_LAYERS); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); + if (layer_id > 0) + connect(layers[layer_id - 1], layers[layer_id]); + } - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - connect(layers[layer_id-1], layers[layer_id]); - } + void append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + connect(layers[layer_id - 1], layers[layer_id]); + } - // forward propagation: [begin, end) is the range of samples used. 
- acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { - // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, &masks[0]); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i ++) - layers[i]->forward(); - return layers[num_layers-1]->get_masked_loss(); - } + // forward propagation: [begin, end) is the range of samples used. + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) + layers[i]->forward(); + return layers[num_layers - 1]->get_masked_loss(); + } - // back propogation - void bprop() { - for (size_t i = num_layers; i != 0; i --) - layers[i-1]->backward(); - } + // back propogation + void bprop() { + for (size_t i = num_layers; i != 0; i--) + layers[i - 1]->backward(); + } - // update trainable weights after back-propagation - void update_weights(optimizer *opt) { - for (size_t i = 0; i < num_layers; i ++) - if (layers[i]->trainable()) layers[i]->update_weight(opt); - } + // update trainable weights after back-propagation + void update_weights(optimizer* opt) { + for (size_t i = 0; i < num_layers; i++) + if (layers[i]->trainable()) + layers[i]->update_weight(opt); + } - // evaluate, i.e. inference or predict - double evaluate(size_t begin, size_t end, size_t count, mask_t *masks, acc_t &loss, acc_t &acc) { - Timer t_eval; - t_eval.Start(); - loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks); - t_eval.Stop(); - return t_eval.Millisecs(); - } + // evaluate, i.e. 
inference or predict + double evaluate(size_t begin, size_t end, size_t count, mask_t* masks, + acc_t& loss, acc_t& acc) { + Timer t_eval; + t_eval.Start(); + loss = fprop(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks); + t_eval.Stop(); + return t_eval.Millisecs(); + } protected: - Context *context; - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 - unsigned num_epochs; // number of epochs - std::vector feature_dims; // feature dimnesions for each layer - std::vector train_mask, val_mask; // masks for traning and validation - size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - std::vector layers; // all the layers in the neural network + Context* context; + size_t num_samples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + unsigned num_epochs; // number of epochs + std::vector feature_dims; // feature dimnesions for each layer + std::vector train_mask, val_mask; // masks for traning and validation + size_t train_begin, train_end, train_count, val_begin, val_end, val_count; + std::vector layers; // all the layers in the neural network - // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); - if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; - } + // comparing outputs with the ground truth (labels) + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), + [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1] + ->next() + ->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; + } }; #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 5a3cf3f83f..918b91b86c 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -9,52 +9,55 @@ class edge; typedef std::shared_ptr edgeptr_t; -// node data structure: each layer is a node, two layers are connected by an edge +// node data structure: each layer is a node, two layers are connected by an +// edge class node : public std::enable_shared_from_this { public: - node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} - virtual ~node() {} - const edgeptr_t prev() const { return prev_; } - const edgeptr_t next() const { return next_; } + node(size_t in_size, size_t out_size) { + } //: prev_(in_size), next_(out_size) {} + virtual ~node() {} + const edgeptr_t prev() const { return prev_; } + const edgeptr_t next() const { return next_; } protected: - node() = delete; - friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); - mutable edgeptr_t prev_; - mutable 
edgeptr_t next_; + node() = delete; + friend void connect(layer* head, layer* tail, size_t head_index, + size_t tail_index); + mutable edgeptr_t prev_; + mutable edgeptr_t next_; }; // edges manage the input/output data and gradients between nodes class edge { public: - edge(node *prev, size_t n, size_t len) : - num_samples_(n), ft_dim_(len), - data_(NULL), grad_(NULL), prev_(prev) {} - - void alloc(); - void alloc_gpu(); - void merge_grads(vec_t *dst); - void merge_grads_gpu(float_t *dst); - void clear_grads(); - void clear_grads_gpu(); - - void set_data(float_t *ptr) { data_ = ptr; } - float_t *get_data() { return data_; } - const float_t *get_data() const { return data_; } - float_t *get_gradient() { return grad_; } - const float_t *get_gradient() const { return grad_; } - - const node *next() const { return next_; } - node *prev() { return prev_; } - const node *prev() const { return prev_; } - void add_next_node(node *next) { next_ = next; } + edge(node* prev, size_t n, size_t len) + : num_samples_(n), ft_dim_(len), + // data_(vec_t(n*len)), grad_(vec_t(n*len)), + data_(NULL), grad_(NULL), prev_(prev) {} + + void alloc(); + void alloc_gpu(); + void merge_grads(vec_t* dst); + void merge_grads_gpu(float_t* dst); + void clear_grads(); + void clear_grads_gpu(); + + void set_data(float_t* ptr) { data_ = ptr; } + float_t* get_data() { return data_; } + const float_t* get_data() const { return data_; } + float_t* get_gradient() { return grad_; } + const float_t* get_gradient() const { return grad_; } + + const node* next() const { return next_; } + node* prev() { return prev_; } + const node* prev() const { return prev_; } + void add_next_node(node* next) { next_ = next; } private: - size_t num_samples_;// number of samples - size_t ft_dim_; // feature dimensions - float_t *data_; // feature vectors - float_t *grad_; // gradients - node *prev_; // previous node, "producer" of data - node *next_; // next node, "consumer" of data + size_t num_samples_; // number of samples + size_t ft_dim_; // feature dimensions + float_t* data_; // feature vectors + float_t* grad_; // gradients + node* prev_; // previous node, "producer" of data + node* next_; // next node, "consumer" of data }; - diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 072eb7d2bc..f1822adc7d 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -5,31 +5,36 @@ #include "types.h" // base class of optimizer -// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function) +// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss +// function) struct optimizer { - optimizer() = default; - optimizer(const optimizer &) = default; - optimizer(optimizer &&) = default; - optimizer &operator=(const optimizer &) = default; - optimizer &operator=(optimizer &&) = default; - virtual ~optimizer() = default; - virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; - virtual void update_gpu(const float_t *dW, float_t *W) = 0; - virtual void reset() {} // override to implement pre-learning action + optimizer() = default; + optimizer(const optimizer&) = default; + optimizer(optimizer&&) = default; + optimizer& operator=(const optimizer&) = default; + optimizer& operator=(optimizer&&) = default; + virtual ~optimizer() = default; + virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; + virtual void reset() {} // override to implement pre-learning action }; // helper class to hold N values for each 
weight template struct stateful_optimizer : public optimizer { - void reset() override { for (auto &e : E_) e.clear(); } + void reset() override { + for (auto& e : E_) + e.clear(); + } + protected: - template - vec_t &get(const vec_t &key) { - static_assert(Index < N, "index out of range"); - if (E_[Index][&key].empty()) E_[Index][&key].resize(key.size(), float_t()); - return E_[Index][&key]; - } - std::unordered_map E_[N]; + template + vec_t& get(const vec_t& key) { + static_assert(Index < N, "index out of range"); + if (E_[Index][&key].empty()) + E_[Index][&key].resize(key.size(), float_t()); + return E_[Index][&key]; + } + std::unordered_map E_[N]; }; /** @@ -40,12 +45,26 @@ struct stateful_optimizer : public optimizer { * The Journal of Machine Learning Research, pages 2121-2159, 2011. **/ struct adagrad : public stateful_optimizer<1> { - adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - private: - float_t eps; + adagrad() : alpha(0.01), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } + } + float_t alpha; // learning rate +private: + float_t eps; }; /** @@ -55,35 +74,54 @@ struct adagrad : public stateful_optimizer<1> { * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) **/ struct RMSprop : public stateful_optimizer<1> { - RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - float_t mu; // decay term + RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); + } + float_t alpha; // learning rate + float_t mu; // decay term private: - float_t eps; // constant value to avoid zero-division + float_t eps; // constant value to avoid zero-division }; // Adam: A Method for Stochastic Optimization // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { - adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), - b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); -#ifdef CPU_ONLY - void update_gpu(const float_t *dW, float_t *W) {} -#else - void update_gpu(const float_t *dW, float_t *W); -#endif - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t - float_t b2_t; // decay term power t + adam() + : alpha(0.01), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), + b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& vt = get<1>(W); + 
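    // Per-element Adam step (see http://arxiv.org/abs/1412.6980, cited above):
    //   mt <- b1*mt + (1-b1)*dW            first-moment estimate
    //   vt <- b2*vt + (1-b2)*dW*dW         second-moment estimate
    //   W  <- W - alpha * (mt/(1-b1_t)) / sqrt(vt/(1-b2_t) + eps)
    // b1_t and b2_t hold b1^t and b2^t; they are decayed after the loop, so
    // the bias correction fades as training proceeds.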
galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + float_t b2_t; // decay term power t private: - float_t eps; // constant value to avoid zero-division + float_t eps; // constant value to avoid zero-division }; /** @@ -93,29 +131,48 @@ struct adam : public stateful_optimizer<2> { * */ struct adamax : public stateful_optimizer<2> { - adamax() : alpha(float_t(0.002)), - b1(float_t(0.9)), b2(float_t(0.999)), - b1_t(b1), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t + adamax() + : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), + eps(float_t(1e-8)) {} + + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); + b1_t *= b1; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t private: - float_t eps; // constant value to avoid zero-division + float_t eps; // constant value to avoid zero-division }; -// SGD without momentum -// slightly faster than tiny_dnn::momentum +/** + * SGD without momentum + * + * slightly faster than tiny_dnn::momentum + **/ struct gradient_descent : public optimizer { - gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - float_t lambda; // weight decay + gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize) { + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); + } + float_t alpha; // learning rate + float_t lambda; // weight decay }; /** @@ -126,14 +183,27 @@ struct gradient_descent : public optimizer { * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. 
**/ struct momentum : public stateful_optimizer<1> { - public: +public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + + // for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = + mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + //}); + }, + galois::loopname("momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum }; /** @@ -144,14 +214,26 @@ struct momentum : public stateful_optimizer<1> { * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. **/ struct nesterov_momentum : public stateful_optimizer<1> { - public: +public: nesterov_momentum() - : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum + : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + + // for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = + mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + //}); + }, + galois::loopname("nesterov_momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum }; - diff --git a/libdeepgalois/include/random.h b/libdeepgalois/include/random.h index 9236e9c391..8560a24de1 100644 --- a/libdeepgalois/include/random.h +++ b/libdeepgalois/include/random.h @@ -4,60 +4,65 @@ typedef boost::mt19937 rng_t; // random seeding int64_t seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, using fallback algorithm " + "to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; } // This random number generator facade hides boost and CUDA rng // implementation from one another (for cross-platform compatibility). 
class RNG { public: - RNG() : generator_(new Generator()) { } - explicit RNG(unsigned int seed) : generator_(new Generator(seed)) { } - explicit RNG(const RNG&); - RNG& operator=(const RNG& other) { generator_ = other.generator_; return *this; } - void* generator() { return static_cast(generator_->rng()); } + RNG() : generator_(new Generator()) {} + explicit RNG(unsigned int seed) : generator_(new Generator(seed)) {} + explicit RNG(const RNG&); + RNG& operator=(const RNG& other) { + generator_ = other.generator_; + return *this; + } + void* generator() { return static_cast(generator_->rng()); } + private: - class Generator { - public: - Generator() : rng_(new rng_t(seedgen())) {} - explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} - rng_t* rng() { return rng_.get(); } - private: - std::shared_ptr rng_; - }; - - std::shared_ptr generator_; + class Generator { + public: + Generator() : rng_(new rng_t(seedgen())) {} + explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} + rng_t* rng() { return rng_.get(); } + + private: + std::shared_ptr rng_; + }; + + std::shared_ptr generator_; }; std::shared_ptr random_generator_; inline static RNG& rng_stream() { - random_generator_.reset(new RNG()); - return *random_generator_; + random_generator_.reset(new RNG()); + return *random_generator_; } -inline rng_t* rng() { - return static_cast(rng_stream().generator()); -} +inline rng_t* rng() { return static_cast(rng_stream().generator()); } #include template -void rng_bernoulli(const DataTy p, std::vector &r) { - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(rng(), random_distribution); - for (size_t i = 0; i < r.size(); ++i) - r[i] = static_cast(variate_generator()); +void rng_bernoulli(const DataTy p, std::vector& r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator> + variate_generator(rng(), random_distribution); + for (size_t i = 0; i < r.size(); ++i) + r[i] = static_cast(variate_generator()); } #endif diff --git a/libdeepgalois/include/timer.h b/libdeepgalois/include/timer.h index e6c838c37b..af01412463 100644 --- a/libdeepgalois/include/timer.h +++ b/libdeepgalois/include/timer.h @@ -4,18 +4,25 @@ class Timer { public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { + return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; + } + double Millisecs() const { + return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; + } + double Microsecs() const { + return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; + } + private: - struct timeval start_time_; - struct timeval elapsed_time_; + struct timeval start_time_; + struct timeval elapsed_time_; }; -#endif // TIMER_H_ +#endif // TIMER_H_ diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h 
index 720c2ae2b8..5890ed307c 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -11,14 +11,14 @@ typedef float float_t; typedef float feature_t; // feature type #endif typedef std::vector vec_t; // feature vector (1D) -typedef std::vector tensor_t; // feature vectors (2D): num_samples x feature_dim +typedef std::vector + tensor_t; // feature vectors (2D): num_samples x feature_dim typedef std::vector FV; // feature vector -typedef std::vector FV2D; // feature vectors: num_samples x feature_dim -typedef float acc_t; // Accuracy type -typedef short label_t; // label is for classification (supervised learning) -typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test +typedef std::vector FV2D; // feature vectors: num_samples x feature_dim +typedef float acc_t; // Accuracy type +typedef short label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: + // train, val, test #define CHUNK_SIZE 256 -#define TB_SIZE 256 -#define WARP_SIZE 32 #endif diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 63d0f74ff7..1c330daa5b 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -8,113 +8,121 @@ #include #include -const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset +const std::string path = + "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset enum class net_phase { train, test }; class ResourceManager { public: - ResourceManager() {} - ~ResourceManager(){} - //peak memory usage - std::string get_peak_memory() { - double kbm; - struct rusage CurUsage; - getrusage(RUSAGE_SELF, &CurUsage); - kbm = (double)CurUsage.ru_maxrss; - double mbm = kbm / 1024.0; - double gbm = mbm / 1024.0; - return - "Peak memory: " + - to_string_with_precision(mbm, 3) + " MB; " + - to_string_with_precision(gbm, 3) + " GB"; - } + ResourceManager() {} + ~ResourceManager() {} + // peak memory usage + std::string get_peak_memory() { + double kbm; + struct rusage CurUsage; + getrusage(RUSAGE_SELF, &CurUsage); + kbm = (double)CurUsage.ru_maxrss; + double mbm = kbm / 1024.0; + double gbm = mbm / 1024.0; + return "Peak memory: " + to_string_with_precision(mbm, 3) + " MB; " + + to_string_with_precision(gbm, 3) + " GB"; + } + private: - template - std::string to_string_with_precision(const T a_value, const int& n) { - std::ostringstream out; - out << std::fixed; - out << std::setprecision(n) << a_value; - return out.str(); - } + template + std::string to_string_with_precision(const T a_value, const int& n) { + std::ostringstream out; + out << std::fixed; + out << std::setprecision(n) << a_value; + return out.str(); + } }; class Timer { public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() 
const { + return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; + } + double Millisecs() const { + return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; + } + double Microsecs() const { + return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; + } + private: - struct timeval start_time_; - struct timeval elapsed_time_; + struct timeval start_time_; + struct timeval elapsed_time_; }; class random_generator { public: - static random_generator &get_instance() { - static random_generator instance; - return instance; - } - std::mt19937 &operator()() { return gen_; } - void set_seed(unsigned int seed) { gen_.seed(seed); } + static random_generator& get_instance() { + static random_generator instance; + return instance; + } + std::mt19937& operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } private: - random_generator() : gen_(1) {} - std::mt19937 gen_; + random_generator() : gen_(1) {} + std::mt19937 gen_; }; template inline typename std::enable_if::value, T>::type uniform_rand(T min, T max) { - std::uniform_int_distribution dst(min, max); - return dst(random_generator::get_instance()()); + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); } template inline typename std::enable_if::value, T>::type uniform_rand(T min, T max) { - std::uniform_real_distribution dst(min, max); - return dst(random_generator::get_instance()()); + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); } inline bool bernoulli(float_t p) { - return uniform_rand(float_t(0), float_t(1)) <= p; + return uniform_rand(float_t(0), float_t(1)) <= p; } -inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, std::vector &masks) { - if (dataset_str != "citeseer" && dataset_str != "cora") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - //std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count ++; - } - } - i ++; - } - //std::cout << mask_type + "_mask range: [" << begin << ", " << end - // << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; +inline size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, + std::vector& masks) { + if (dataset_str != "citeseer" && dataset_str != "cora") { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + // std::cout << mask_type + "_mask range: [" << begin << ", " << end + // << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; } - 
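For reference, the mask files parsed by read_masks above store the begin/end vertex range of the split on their first line, followed by one 0/1 flag per vertex line. A minimal, hypothetical sketch of how a caller would drive read_masks for the training split follows; the dataset name and vertex count are illustrative assumptions, not values taken from this patch:

    #include <cstdint>
    #include <vector>
    #include "utils.h"   // the header introduced in the hunk above

    int main() {
      // mask_t is uint8_t (see types.h); one flag per vertex.
      size_t n = 3327;                        // assumed vertex count, e.g. citeseer
      size_t begin = 0, end = 0;
      std::vector<mask_t> train_masks(n, 0);
      size_t count = read_masks("citeseer", "train", begin, end, train_masks);
      // After the call: train_masks[v] == 1 for the `count` labeled vertices
      // inside [begin, end); all other entries remain 0.
      return count > 0 ? 0 : 1;
    }

The same pattern applies to the "val" and "test" splits; only "citeseer" and "cora" are accepted by the current implementation.
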
diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 45862b7516..6bb301b0be 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -3,21 +3,28 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { - clear(len, &out[src*len]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], &neighbor[0]); - vadd(len, &out[src*len], &neighbor[0], &out[src*len]); // out[src] += in[dst] - } else vadd(len, &out[src*len], &in[dst*len], &out[src*len]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +void update_all(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + galois::do_all(galois::iterate(g.begin(), g.end()), + [&](const auto& src) { + clear(len, &out[src * len]); + float_t a = 0.0, b = 0.0; + if (norm) + a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst * len], &neighbor[0]); + vadd(len, &out[src * len], &neighbor[0], + &out[src * len]); // out[src] += in[dst] + } else + vadd(len, &out[src * len], &in[dst * len], + &out[src * len]); // out[src] += in[dst] + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("update_all")); } - diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index a6b61ce914..ea41fd3dcb 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -6,30 +6,36 @@ #include "math_functions.hh" // TODO: use warp -__device__ void scale_add(const int n, const float_t alpha, const float_t* a, const float_t* b, float_t* y) { - for (int i = 0; i < n; i++) y[i] = alpha * a[i] + b[i]; +__device__ void scale_add(const int n, const float_t alpha, const float_t* a, + const float_t* b, float_t* y) { + for (int i = 0; i < n; i++) + y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_kernel(size_t n, size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - CUDA_KERNEL_LOOP(src, n) { - float_t a = 0.0, b = 1.0; - if (norm) a = norm_factor[src]; - index_type begin = g.edge_begin(src); - index_type end = g.edge_end(src); - for (index_type e = begin; e != end; e++) { - index_type dst = g.getEdgeDst(e); - assert(dst < n); - if (norm) b = a * norm_factor[dst]; - scale_add(len, b, in+dst*len, out+src*len, out+src*len); // out[src] += in[dst] - } - } +__global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + CUDA_KERNEL_LOOP(src, n) { + float_t a = 0.0, b = 1.0; + if (norm) + a = norm_factor[src]; + index_type begin = g.edge_begin(src); + index_type end = g.edge_end(src); + for (index_type e = begin; e != end; e++) { + index_type dst = g.getEdgeDst(e); + assert(dst < n); + if (norm) + b = a * norm_factor[dst]; + scale_add(len, b, in + dst * len, out + src * len, + out + src * len); // out[src] += in[dst] + } + } } -void update_all(size_t len, CSRGraph &g, 
const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - std::cout << "[debug]: update_all on GPU\n"; - unsigned n = g.nnodes; - CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); - CudaTest("solving update_all kernel failed"); +void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.nnodes; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>( + n, len, g, in, out, norm, norm_factor); } - diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index a500c02125..04d7c14476 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,143 +1,237 @@ #include "context.h" #include "gtypes.h" +#include +#include + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} #ifdef CPU_ONLY -Context::Context() : mode_(Context::CPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { } +Context::Context() + : mode_(Context::CPU), solver_count_(1), solver_rank_(0), + multiprocess_(false) {} Context::~Context() {} +#else +cublasHandle_t Context::cublas_handle_ = 0; +curandGenerator_t Context::curand_generator_ = 0; + +Context::Context() + : mode_(Context::GPU), solver_count_(1), solver_rank_(0), + multiprocess_(false) { + // void Context::create_blas_handle() { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +Context::~Context() { + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } +} + +void Context::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) + return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} #endif size_t Context::read_graph(std::string dataset_str) { #ifdef CPU_ONLY - n = read_graph_cpu(dataset_str, "gr"); + n = read_graph_cpu(dataset_str, "gr"); #else - n = read_graph_gpu(dataset_str); + n = read_graph_gpu(dataset_str); #endif - return n; + return n; } #ifdef CPU_ONLY size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - LGraph lgraph; - if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - lgraph.read_edgelist(filename.c_str(), true); //symmetrize - genGraph(lgraph, graph_cpu); - lgraph.clean(); - } else if (filetype == "gr") { - 
std::string filename = path + dataset_str + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph_cpu, filename); - } else { printf("Unkown file format\n"); exit(1); } - Tread.stop(); - std::cout << "num_vertices " << graph_cpu.size() << " num_edges " << graph_cpu.sizeEdges() << "\n"; - return graph_cpu.size(); + galois::StatTimer Tread("GraphReadingTime"); + Tread.start(); + LGraph lgraph; + if (filetype == "el") { + std::string filename = path + dataset_str + ".el"; + printf("Reading .el file: %s\n", filename.c_str()); + lgraph.read_edgelist(filename.c_str(), true); // symmetrize + genGraph(lgraph, graph_cpu); + lgraph.clean(); + } else if (filetype == "gr") { + std::string filename = path + dataset_str + ".csgr"; + printf("Reading .gr file: %s\n", filename.c_str()); + galois::graphs::readGraph(graph_cpu, filename); + } else { + printf("Unkown file format\n"); + exit(1); + } + Tread.stop(); + std::cout << "num_vertices " << graph_cpu.size() << " num_edges " + << graph_cpu.sizeEdges() << "\n"; + return graph_cpu.size(); } -void Context::genGraph(LGraph &lg, Graph &g) { - g.allocateFrom(lg.num_vertices(), lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i+1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); - } +void Context::genGraph(LGraph& lg, Graph& g) { + g.allocateFrom(lg.num_vertices(), lg.num_edges()); + g.constructNodes(); + for (size_t i = 0; i < lg.num_vertices(); i++) { + g.getData(i) = 1; + auto row_begin = lg.get_offset(i); + auto row_end = lg.get_offset(i + 1); + g.fixEndEdge(i, row_end); + for (auto offset = row_begin; offset < row_end; offset++) + g.constructEdge(offset, lg.get_dest(offset), 0); + } +} +float_t* Context::get_in_ptr() { return &h_feats[0]; } +#else +size_t Context::read_graph_gpu(std::string dataset_str) { + std::string filename = path + dataset_str + ".csgr"; + graph_gpu.read(filename.c_str(), false); + return graph_gpu.nnodes; } -float_t * Context::get_in_ptr() { return &h_feats[0]; } +void Context::copy_data_to_device() { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), + cudaMemcpyHostToDevice)); +} +float_t* Context::get_in_ptr() { return d_feats; } #endif // user-defined pre-computing function, called during initialization // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -#else - norm_factor_counting_gpu(); + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) + norm_factor[v] = 0.0; + else + norm_factor[v] = 1.0 / temp; + }, + 
galois::loopname("NormCounting")); #endif } -// labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). -// Note that labels is not one-hot encoded vector and it can be computed -// as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. +void Context::degree_counting() { +#ifdef CPU_ONLY + degrees.resize(n); + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + degrees[v] = std::distance(graph_cpu.edge_begin(v), + graph_cpu.edge_end(v)); + }, + galois::loopname("DegreeCounting")); +#endif +} + +// labels contain the ground truth (e.g. vertex classes) for each example +// (num_examples x 1). Note that labels is not one-hot encoded vector and it can +// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if +// required. size_t Context::read_labels(std::string dataset_str) { - std::cout << "Reading labels ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m: number of samples - in >> m >> num_classes >> std::ws; - assert(m == n); - labels.resize(m, 0); // label for each vertex: N x 1 - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < num_classes; ++idx) { - label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; - } - } - v ++; - } - in.close(); - t_read.Stop(); - // print the number of vertex classes - std::cout << "Done, unique label counts: " << num_classes - << ", time: " << t_read.Millisecs() << " ms\n"; - return num_classes; + std::cout << "Reading labels ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; // m: number of samples + in >> m >> num_classes >> std::ws; + assert(m == n); + labels.resize(m, 0); // label for each vertex: N x 1 + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < num_classes; ++idx) { + label_stream >> x; + if (x != 0) { + labels[v] = idx; + break; + } + } + v++; + } + in.close(); + t_read.Stop(); + // print the number of vertex classes + std::cout << "Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + return num_classes; } size_t Context::read_features(std::string dataset_str) { - std::cout << "Reading features ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m = number of vertices - in >> m >> feat_len >> std::ws; - //assert(m == ); - h_feats.resize(m*feat_len, 0); - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - h_feats[u*feat_len+v] = w; - } - in.close(); - t_read.Stop(); - std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; - return feat_len; + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; // m = number of vertices + in >> m >> feat_len >> std::ws; + // assert(m == ); + h_feats.resize(m * feat_len, 0); + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + h_feats[u * feat_len + v] = w; + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len + << ", time: " << t_read.Millisecs() << " ms\n"; + return feat_len; } /* inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); + std::default_random_engine rng; + std::uniform_real_distribution dist(0, 0.1); + for (size_t i = 0; i < dim; ++i) + x[i] = dist(rng); } //*/ - diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 182deeaed0..b68f07ab98 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -6,81 +6,96 @@ // random seeding int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; } -__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, float_t *norm_fac) { - CUDA_KERNEL_LOOP(i, n) { - float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_fac[i] = 0.0; - else norm_fac[i] = 1.0 / temp; - } +__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, + float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) + norm_fac[i] = 0.0; + else + norm_fac[i] = 1.0 / temp; + } } void Context::norm_factor_counting_gpu() { - std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; - assert(graph_gpu.nnodes == n); - CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); - norm_factor_counting_kernel<<>>(n, graph_gpu, d_norm_factor); - CudaTest("solving norm_factor_counting kernel failed"); + std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; + assert(graph_gpu.nnodes == n); + CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); + norm_factor_counting_kernel<<>>( + n, graph_gpu, d_norm_factor); + CudaTest("solving norm_factor_counting kernel failed"); } -cublasHandle_t Context::cublas_handle_ = 0; +cublasHandle_t Context::cublas_handle_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, 
CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +Context::Context() + : mode_(Context::GPU), solver_count_(1), solver_rank_(0), + multiprocess_(false) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } } void Context::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) + return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } size_t Context::read_graph_gpu(std::string dataset_str) { - std::string filename = path + dataset_str + ".csgr"; - CSRGraph g; - g.read(filename.c_str(), false); - g.copy_to_gpu(graph_gpu); - return graph_gpu.nnodes; + std::string filename = path + dataset_str + ".csgr"; + CSRGraph g; + g.read(filename.c_str(), false); + g.copy_to_gpu(graph_gpu); + return graph_gpu.nnodes; } void Context::copy_data_to_device() { - assert(labels.size() == n); - CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + assert(labels.size() == n); + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), + cudaMemcpyHostToDevice)); } -float_t * Context::get_in_ptr() { return d_feats; } - +float_t* Context::get_in_ptr() { return d_feats; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 86f39ade20..06ec53b2db 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,126 +1,164 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph &g, const float_t *in, 
float_t *out) { - update_all(len, g, in, out, true, context->norm_factor); +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + update_all(len, g, in, out, true, context->norm_factor); #else -void graph_conv_layer::aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out) { - update_all(len, g, in, out, true, context->d_norm_factor); +void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, + float_t* out) { + update_all(len, g, in, out, true, context->d_norm_factor); #endif } -void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(Q, self, a); - mvmul(W, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors +void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, + vec_t& out) { + vec_t a(out.size(), 0); + vec_t b(out.size(), 0); + mvmul(Q, self, a); + mvmul(W, neighbors, b); + vadd(a, b, out); // out = W*self + Q*neighbors } -graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, - bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), act_(act), norm_(norm), - bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - init(); - scale_ = 1. / (1. - dropout_rate_); +graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, + bool bias, bool dropout, float dropout_rate, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + init(); + scale_ = 1. / (1. - dropout_rate_); } void graph_conv_layer::init() { - Timer t_alloc; - t_alloc.Start(); - //std::cout << name_ << ": allocating memory for parameters and intermediate data... "; + std::cout << name_ + << ": allocating memory for parameters and intermediate data... 
"; + Timer t_alloc; + t_alloc.Start(); #ifdef CPU_ONLY - rand_init_matrix(y, z, W); // randomly initialize trainable parameters - //rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); - if (dropout_) dropout_mask = new unsigned[x*y]; - in_temp = new float_t[x*y]; - out_temp = new float_t[x*z]; // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - trans_data = new float_t[y*x]; // y*x + rand_init_matrix(y, z, W); // randomly initialize trainable parameters + // rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, weight_grad); + if (dropout_) + dropout_mask = new unsigned[x * y]; + in_temp = new float_t[x * y]; + out_temp = new float_t + [x * z]; // same as pre_sup in original GCN code: + // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + trans_data = new float_t[y * x]; // y*x #else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, + d_weight_grad); #endif - t_alloc.Stop(); - //std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; + t_alloc.Stop(); + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - // input: x*y; W: y*z; output: x*z - // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); - }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(z, &out_data[i*z], &out_data[i*z]); - }, galois::loopname("relu")); - } +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, + galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else + matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + if (act_) { + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, + galois::loopname("relu")); + } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { -//void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? 
- out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? out_grad[i*z+j] : float_t(0); - }, galois::loopname("d_relu")); - } else copy1D1D(x*z, out_grad, out_temp); // TODO: avoid copying - if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y - //sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> x*y - //NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); - }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); - } - } - // calculate weight gradients - transpose(x, y, in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + // void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t + // &out_data, vec_t &out_grad, vec_t &in_grad) { + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + for (size_t j = 0; j < z; + ++j) // TODO: use in_data or out_data? + out_temp[i * z + j] = out_data[i * z + j] > float_t(0) + ? out_grad[i * z + j] + : float_t(0); + }, + galois::loopname("d_relu")); + } else + copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + if (level_ != 0) { // no need to calculate in_grad for the first layer + vec_t trans_W(z * y); + transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y + // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> + // x*y NOTE: since graph is symmetric, the derivative is the same + update_all(y, context->graph_cpu, in_temp, in_grad, true, + context->norm_factor); // x*x; x*y -> x*y + if (dropout_) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + d_dropout(y, scale_, &in_grad[i * y], + &dropout_mask[i * y], &in_grad[i * y]); + }, + galois::chunk_size(), galois::steal(), + galois::loopname("d_dropout")); + } + } + // calculate weight gradients + transpose(x, y, in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } #else // GPU forward -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - assert(y <= 128); // currently only support feature length <= 128 - assert(in_data != NULL); - assert(in_temp != NULL); - assert(dropout_mask != NULL); - if (dropout_ && phase_ == net_phase::train) { - dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) relu_gpu(x*z, out_data, out_data); +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + assert(y <= 128); // currently only support feature length <= 128 + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + 
} else + matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) + relu_gpu(x * z, out_data, out_data); } // GPU backward -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); - else copy_gpu(x*z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_temp); + else + copy_gpu(x * z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_temp); + update_all(y, context->graph_gpu, in_temp, in_grad, true, + context->d_norm_factor); + if (dropout_) + d_dropout(y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index eb02f66d50..0c52d0eb25 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,31 +1,45 @@ #include "layers/relu_layer.h" // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); +void relu_layer::forward_propagation(const tensor_t& in_data, + tensor_t& out_data) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), + [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + out_data[i][j] = std::max(in_data[i][j], (float_t)0); + }, + galois::chunk_size(), galois::steal(), + galois::loopname("relu_layer-fw")); } // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - const size_t count = input_dims[0] * input_dims[1]; - relu_gpu(count, in_data, out_data); +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); +void relu_layer::back_propagation(const tensor_t& in_data, + const tensor_t& out_data, tensor_t& out_grad, + tensor_t& in_grad) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), + [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + in_grad[i][j] = out_data[i][j] > float_t(0) + ? out_grad[i][j] + : float_t(0); + }, + galois::chunk_size(), galois::steal(), + galois::loopname("relu_layer-bw")); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - const size_t count = input_dims[0] * input_dims[1]; - d_relu_gpu(count, out_grad, in_data, in_grad); +void relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 430e1f253b..579de65667 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,71 +1,72 @@ #include "layers/softmax_loss_layer.h" -softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : layer(level, in_dims, out_dims) { - trainable_ = false; - name_ = layer_type() + "_" + std::to_string(level); - std::cout << name_ << ": allocating memory for intermediate data... "; +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); #ifdef CPU_ONLY - loss = new float_t[in_dims[0]]; // error for each sample + loss = new float_t[in_dims[0]]; // error for each sample #else - loss_malloc_device(in_dims[0], loss); + out_malloc_device(in_dims[0], masks_, d_masks_, loss); #endif - std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t + // &out_data) { + size_t len = input_dims[1]; + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(len, &in_data[len * i], + &out_data[len * i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[context->get_label(i)] = 1.0; // one-hot + loss[i] = cross_entropy(len, &y[0], &out_data[len * i]); + } + }, + 
galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-fw")); } -void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); - d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); +// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t +// &out_data, vec_t &out_grad, vec_t &in_grad) { +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t len = input_dims[1]; + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth + y[context->get_label(i)] = 1.0; + d_cross_entropy(len, &y[0], &out_data[len * i], + &norm_grad[0]); + d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-bw")); } - -acc_t softmax_loss_layer::get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; -} - #else // GPU implementation -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, loss, out_data); -} - -void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, out_data, in_grad); +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, loss, out_data); } -acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, masks_, loss); +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 6b41afb020..2e2d68f05d 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -10,463 +10,497 @@ extern "C" { // vector add #if defined(__AVX__) || defined(__AVX2__) -void vadd(const vec_t &a, const vec_t &b, vec_t &out) { - //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; - size_t n = out.size(); - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - 
_mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; +void vadd(const vec_t& a, const vec_t& b, vec_t& out) { + // for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; + size_t n = out.size(); + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps( + &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + out[i] = a[i] + b[i]; +} + +void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps( + &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + out[i] = a[i] + b[i]; } #else -void vadd(const vec_t &a, const vec_t &b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; +void vadd(const vec_t& a, const vec_t& b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) + out[i] = a[i] + b[i]; } -void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { - for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; +void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = a[i] + b[i]; } #endif // vector subtract -void vsub(const vec_t &in_a, const vec_t &in_b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; +void vsub(const vec_t& in_a, const vec_t& in_b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) + out[i] = in_a[i] - in_b[i]; } // vector multiply -void vmul(const vec_t &in_a, const vec_t &in_b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; +void vmul(const vec_t& in_a, const vec_t& in_b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) + out[i] = in_a[i] * in_b[i]; } // vector divide -void vdiv(const vec_t &in_a, const vec_t &in_b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) { - assert(in_b[i] != 0); - out[i] = in_a[i] / in_b[i]; - } +void vdiv(const vec_t& in_a, const vec_t& in_b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) { + assert(in_b[i] != 0); + out[i] = in_a[i] / in_b[i]; + } } // vector add scalar -void add_scalar(const float_t alpha, vec_t &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; +void add_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] += alpha; } // vector subtract scalar -void sub_scalar(const float_t alpha, vec_t &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] -= alpha; +void sub_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] -= alpha; } // vector multiply scalar -void mul_scalar(const float_t alpha, vec_t &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; +void mul_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] *= alpha; } -void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out) { - for (size_t i = 0; i < n; ++i) 
out[i] = alpha *in[i]; +void mul_scalar(size_t n, const float_t alpha, const float_t* in, + float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = alpha * in[i]; } // vector divide scalar -void div_scalar(const float_t alpha, vec_t &Y) { - assert(alpha != 0); - for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; +void div_scalar(const float_t alpha, vec_t& Y) { + assert(alpha != 0); + for (size_t i = 0; i < Y.size(); ++i) + Y[i] /= alpha; } // dot product -float_t dot(const vec_t &x, const vec_t &y) { - float_t sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; +float_t dot(const vec_t& x, const vec_t& y) { + float_t sum = 0; + for (size_t i = 0; i < x.size(); ++i) + sum += x[i] * y[i]; + return sum; } -float_t dot(size_t n, const float_t *x, const float_t *y) { - float_t sum = 0; - for (size_t i = 0; i < n; ++i) - sum += x[i] * y[i]; - return sum; +float_t dot(size_t n, const float_t* x, const float_t* y) { + float_t sum = 0; + for (size_t i = 0; i < n; ++i) + sum += x[i] * y[i]; + return sum; } // matrix-vector multiply -void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { - size_t m = out_vector.size(); - size_t n = in_vector.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out_vector[i] += matrix[i*n+j] * in_vector[j]; - } - } +void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector) { + size_t m = out_vector.size(); + size_t n = in_vector.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out_vector[i] += matrix[i * n + j] * in_vector[j]; + } + } } // vector-vector multiply -void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { - size_t m = a.size(); - size_t n = b.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out[i][j] += a[i] * b[j]; - } - } +void vvmul(const vec_t& a, const vec_t& b, tensor_t& out) { + size_t m = a.size(); + size_t n = b.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out[i][j] += a[i] * b[j]; + } + } } // matrix addition -void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { - for (size_t i = 0; i < x; ++i) - for (size_t j = 0; j < y; ++j) - C[i][j] = A[i][j] + B[i][j]; +void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, + tensor_t& C) { + for (size_t i = 0; i < x; ++i) + for (size_t j = 0; j < y; ++j) + C[i][j] = A[i][j] + B[i][j]; } // TODO: vectorize -void copy2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - auto ptr = &out[0]; - for (size_t i = 0; i < x; i++) { - std::copy(in[i].begin(), in[i].end(), ptr); - ptr += y; - } -} - -void copy1D1D(const vec_t &in, vec_t &out) { - std::copy(in.begin(), in.end(), &out[0]); -} - -void copy1D1D(size_t len, const float_t *in, float_t *out) { - std::copy(in, in+len, out); -} - -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); -} - -void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(C.size() == dim_x); - assert(B.size() == dim_z); - assert(B[0].size() == dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k][j]; - } - } - } -} - -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t *A, const float_t *B, float_t *C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); - Tmatmul.stop(); -} - -void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_z = A[0].size(); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); - vec_t A1D(dim_x*dim_z); - copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); -} - -void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = C.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(A.size() == dim_x); - assert(B.size() == dim_y*dim_z); - vec_t A1D(dim_x*dim_z); - vec_t C1D(dim_x*dim_y, 0); - auto ptr = &A1D[0]; - for (size_t i = 0; i < dim_x; i++) { - std::copy(A[i].begin(), A[i].end(), ptr); - ptr += dim_z; - } - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); - for (size_t i = 0; i < dim_x; i++) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = C1D[i*dim_y+j]; - } - } -} - -void transpose2D(const tensor_t &in, tensor_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i][j] = in[j][i]; - } - } +void copy2D1D(const tensor_t& in, vec_t& out) { + size_t x = in.size(); + size_t y = in[0].size(); + auto ptr = &out[0]; + for (size_t i = 0; i < x; i++) { + std::copy(in[i].begin(), in[i].end(), ptr); + ptr += y; + } +} + +void copy1D1D(const vec_t& in, vec_t& out) { + std::copy(in.begin(), in.end(), &out[0]); +} + +void copy1D1D(size_t len, const float_t* in, float_t* out) { + std::copy(in, in + len, out); +} + +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); +} + +void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(C.size() == dim_x); + assert(B.size() == dim_z); + assert(B[0].size() == dim_y); + + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } +} + +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); + Tmatmul.stop(); +} + +void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, + vec_t& C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_z = A[0].size(); + assert(B.size() == dim_z * dim_y); + assert(C.size() == dim_x * dim_y); + vec_t A1D(dim_x * dim_z); + copy2D1D(A, A1D); + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); +} + +void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = C.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(A.size() == dim_x); + assert(B.size() == dim_y * dim_z); + vec_t A1D(dim_x * dim_z); + vec_t C1D(dim_x * dim_y, 0); + auto ptr = &A1D[0]; + for (size_t i = 0; i < dim_x; i++) { + std::copy(A[i].begin(), A[i].end(), ptr); + ptr += dim_z; + } + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); + for (size_t i = 0; i < dim_x; i++) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = C1D[i * dim_y + j]; + } + } +} + +void transpose2D(const tensor_t& in, tensor_t& out) { + size_t x = in.size(); + size_t y = in[0].size(); + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i][j] = in[j][i]; + } + } } // TODO: vectorize -void transpose2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - assert(out.size() == x*y); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j][i]; - } - } +void transpose2D1D(const tensor_t& in, vec_t& out) { + size_t x = in.size(); + size_t y = in[0].size(); + assert(out.size() == x * y); + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j][i]; + } + } } -void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j*y+i]; - } - } +void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } } -void transpose(size_t x, size_t y, const float_t *in, float_t *out) { - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j*y+i]; - } - } +void transpose(size_t x, size_t y, const float_t* in, float_t* out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } } -int argmax(const size_t n, const vec_t &x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; +int argmax(const size_t n, 
const vec_t& x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; } -int argmax(const size_t n, const float_t *x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; +int argmax(const size_t n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; } -void clear(vec_t &in) { - for (size_t i = 0; i < in.size(); i++) in[i] = 0; +void clear(vec_t& in) { + for (size_t i = 0; i < in.size(); i++) + in[i] = 0; } -void clear(size_t n, float_t *in) { - for (size_t i = 0; i < n; i++) in[i] = 0; +void clear(size_t n, float_t* in) { + for (size_t i = 0; i < n; i++) + in[i] = 0; } -void relu(const vec_t &in, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0) + negative_slope * std::min(in[i], (float_t)0); - } +void relu(const vec_t& in, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) { + out[i] = std::max(in[i], (float_t)0) + + negative_slope * std::min(in[i], (float_t)0); + } } -void relu(size_t n, const float_t *in, float_t *out) { - for (size_t i = 0; i < n; ++i) - out[i] = std::max(in[i], float_t(0)); +void relu(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = std::max(in[i], float_t(0)); } -void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); - } +void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) { + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + + negative_slope * (fv[i] <= (float_t)0)); + } } -void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { - vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff +void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { + vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff } -void d_vadd(vec_t &in_diff, vec_t &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) - out_diff[i] = in_diff[i]; +void d_vadd(vec_t& in_diff, vec_t& out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) + out_diff[i] = in_diff[i]; } -float reduce_mean(const vec_t &x) { - size_t n = x.size(); - assert(n > 0); - float sum = (float)x[0]; - for (size_t i = 1; i < n; i++) { - sum += (float)x[i]; - } - return sum / (float)n; +float reduce_mean(const vec_t& x) { + size_t n = x.size(); + assert(n > 0); + float sum = (float)x[0]; + for (size_t i = 1; i < n; i++) { + sum += (float)x[i]; + } + return sum / (float)n; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out) { - assert(masks.size() == out.size()); - //rng_bernoulli(1. - dropout_rate, masks); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, vec_t& out) { + assert(mask.size() == out.size()); + // rng_bernoulli(1. 
- dropout_rate, mask); // Create random numbers + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out) { - for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, float_t* out) { + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale; } -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - for (size_t i = 0; i < n; ++i) - masks[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < n; ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out) { + for (size_t i = 0; i < n; ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < n; ++i) + out[i] = in[i] * mask[i] * scale; } -void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& mask, vec_t& out_diff) { + for (size_t i = 0; i < in_diff.size(); ++i) + out_diff[i] = in_diff[i] * mask[i] * scale; } -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff) { - for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* mask, float_t* out_diff) { + for (size_t i = 0; i < n; ++i) + out_diff[i] = in_diff[i] * mask[i] * scale; } -float_t sigmoid_func(float_t x) { - return 0.5 * tanh(0.5 * x) + 0.5; -} +float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } // Sigmoid -void sigmoid(vec_t &fv) { - size_t count = fv.size(); - for (size_t i = 0; i < count; ++i) { - fv[i] = sigmoid_func(fv[i]); - } +void sigmoid(vec_t& fv) { + size_t count = fv.size(); + for (size_t i = 0; i < count; ++i) { + fv[i] = sigmoid_func(fv[i]); + } } // Softmax function takes an N-dimensional vector (X) of real number, -// and transforms it into a vector of real number in range (0,1) which add upto 1. -// To make softmax func numerically stable, we simply normalize the values in the vector, -// by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) +// and transforms it into a vector of real number in range (0,1) which add +// upto 1. 
To make softmax func numerically stable, we simply normalize the +// values in the vector, by multiplying the numerator and denominator with a +// constant C, where log(C)=-max(X) // exps = np.exp(X - np.max(X)) // exps / np.sum(exps) -void softmax(const vec_t &input, vec_t &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - -void softmax(size_t n, const float_t *input, float_t *output) { - const float_t max = *std::max_element(input, input+n); - float_t denominator(0); - for (size_t i = 0; i < n; i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < n; i++) - output[i] /= denominator; -} - -void log_softmax(const vec_t &input, vec_t &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) - denominator += std::exp(input[i] - max); - for (size_t i = 0; i < input.size(); i++) - output[i] = input[i] - max - denominator; -} - -// Due to the desirable property of softmax function outputting a probability distribution, -// we often use it as the final layer in neural networks. -// For this we need to calculate the derivative or gradient, -// and pass it back to the previous layer during backpropagation. -void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - //float_t delta_ij = i == j? 1 : 0; - //df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -} - -void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp) { - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - dy[i] = dot(n, dp, &df[0]); - } +void softmax(const vec_t& input, vec_t& output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < input.size(); i++) + output[i] /= denominator; +} + +void softmax(size_t n, const float_t* input, float_t* output) { + const float_t max = *std::max_element(input, input + n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + +void log_softmax(const vec_t& input, vec_t& output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) + denominator += std::exp(input[i] - max); + for (size_t i = 0; i < input.size(); i++) + output[i] = input[i] - max - denominator; +} + +// Due to the desirable property of softmax function outputting a probability +// distribution, we often use it as the final layer in neural networks. For this +// we need to calculate the derivative or gradient, and pass it back to the +// previous layer during backpropagation. 
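For reference, the two d_softmax overloads below compute a softmax Jacobian-vector product. Writing p = softmax(x) for the predicted probabilities and dp for the gradient arriving from the loss (the same names used in the code), a compact way to state the relation is

    \frac{\partial p_i}{\partial x_j} = p_i(\delta_{ij} - p_j), \qquad
    dy_i = \sum_j \frac{\partial p_i}{\partial x_j}\, dp_j
         = p_i\Big(dp_i - \sum_j p_j\, dp_j\Big),

where the inner loop over j materializes one row of the Jacobian in df before taking its dot product with dp.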
+void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { + auto n = y.size(); + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + // float_t delta_ij = i == j? 1 : 0; + // df[i] += p[j] * (delta_ij - p[i]); + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + // dy = dp * (gradient of softmax) + dy[i] = dot(dp, df); + } +} + +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } } // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability -float_t cross_entropy(const vec_t &y, const vec_t &p) { - auto n = y.size(); - assert(n > 0); - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) continue; - if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - //if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]);// + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); - //loss -= y[i] * std::log(p[i]); - } - return loss; -} - -float_t cross_entropy(size_t n, const float_t *y, const float_t *p) { - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) continue; - if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]); - } - return loss; -} - -void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { - auto n = y.size(); - //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - //d[i] = p[i] - y[i]; - } -} - -void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d) { - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } +float_t cross_entropy(const vec_t& y, const vec_t& p) { + auto n = y.size(); + assert(n > 0); + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * + // std::log(float_t(1e-10)); + else + loss -= + y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) + // - p[i]); loss -= y[i] * std::log(p[i]); + } + return loss; +} + +float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else + loss -= y[i] * std::log(p[i]); + } + return loss; +} + +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { + auto n = y.size(); + // for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - + // p[i])); + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + // d[i] = p[i] - y[i]; + } +} + +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } } - diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 415e141ec9..8dbe141c96 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,288 +1,253 @@ #include "math_functions.hh" 
#include "context.h" -#include "gg.h" -#include "ggcuda.h" -#include "cub/cub.cuh" -#include - -void gpu_rng_uniform(const int n, unsigned *r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); -} - -void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); - const float range = b - a; - if (range != float_t(1)) scal_gpu(n, range, r); - if (a != float_t(0)) add_scalar_gpu(n, a, r); -} - -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { - CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); -} - -void loss_malloc_device(int n, float_t *loss) { - CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); -} - -void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks) { - assert(h_masks != NULL); - CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +void gpu_rng_uniform(const int n, unsigned* r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { - if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0/(y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y*z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +void gpu_rng_uniform(const int n, const float_t a, const float_t b, + float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t(1)) + scal_gpu(n, range, r); + if (a != float_t(0)) + add_scalar_gpu(n, a, r); } -void copy_gpu(size_t len, const float_t *in, float_t *out) { - CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK( + curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -__global__ void setup_curand_kernel(const int n, curandState *state) { - CUDA_KERNEL_LOOP(i, n) { - curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - //curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed - } +void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss) { + CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK( + cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } -__device__ bool bernoulli_gpu(int tid, curandState *state, float_t p) { - curandState local_state = state[tid]; - return curand_uniform(&local_state) <= p; +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned* masks, float_t* in, float_t* out, + float_t* matrix, float_t* grad) { + if (dropout) + CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); + 
CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0 / (y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y * z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - masks[i] = bernoulli_gpu(i, state, dropout_rate); - out[i] = in[i] * masks[i] * scale; - } +void copy_gpu(size_t len, const float_t* in, float_t* out) { + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - curandState *devStates; - CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); - std::cout << "[debug]: setup curand, n = " << n << "\n"; - setup_curand_kernel<<>>(n, devStates); - CudaTest("solving setup_curand kernel failed"); - std::cout << "[debug]: dropout_gpu\n"; - dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); - CudaTest("solving dropout kernel failed"); - CUDA_CHECK(cudaFree(devStates)); - std::cout << "[debug]: dropout_gpu done\n"; +__global__ void dropout_kernel(const int n, const float scale, + const float dropout_rate, const float_t* in, + unsigned* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + // masks[i] = bernoulli(dropout_rate); + out[i] = in[i] * masks[i] * scale; + } } -__global__ void d_dropout_kernel(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { - CUDA_KERNEL_LOOP(i, n) { - out[i] = in[i] * masks[i] * scale; - } -} - -void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { - d_dropout_kernel<<>>(n, scale, in, masks, out); - CudaTest("solving dropout kernel failed"); +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out) { + dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); } // flattern data into 1D before feed into the ReLU operater __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? in[index] : 0; - } + CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } } -void relu_gpu(const int n, const float_t *in, float_t* out) { - std::cout << "[debug]: relu_gpu\n"; - relu_kernel<<>>(n, in, out); - CudaTest("solving relu kernel failed"); +void relu_gpu(const int n, const float_t* in, float_t* out) { + relu_kernel<<>>(n, in, out); } -__global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = data[index] > 0 ? in_diff[index] : 0; - } +__global__ void d_relu_kernel(const int n, const float_t* in_diff, + const float_t* data, float_t* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = data[index] > 0 ? 
in_diff[index] : 0; + } } -void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff) { - d_relu_kernel<<>>(n, in_diff, data, out_diff); - CudaTest("solving d_relu kernel failed"); +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff) { + d_relu_kernel<<>>(n, in_diff, data, + out_diff); } -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, + K, &alpha, B, ldb, A, lda, &beta, C, N)); } -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { - std::cout << "[debug]: matmul1D1D_gpu\n"; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } // the arguments of the maxima -int argmax_gpu(const size_t n, const float_t *x) { - return 0; -} +int argmax_gpu(const size_t n, const float_t* x) { return 0; } -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, const float beta, float* y) { - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, + N, x, 1, &beta, y, 1)); } -void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); +void scal_gpu(const int N, const float alpha, float* X) { + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); } -void scale_gpu(const int n, const float alpha, const float *x, float* y) { - CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); +void scale_gpu(const int n, const float alpha, const float* x, float* y) { + CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = alpha; - } + CUDA_KERNEL_LOOP(index, n) { y[index] = alpha; } } void set_gpu(const int N, const float_t alpha, float_t* Y) { - if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); - return; - } - set_kernel<<>>(N, alpha, Y); - CudaTest("solving set kernel failed"); + if (alpha == 0) { + CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); + return; + } + set_kernel<<>>(N, alpha, Y); } -__global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] += alpha; - } +__global__ void add_scalar_kernel(const int n, const float_t alpha, + float_t* y) { + CUDA_KERNEL_LOOP(index, n) { y[index] += alpha; } } void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { - add_scalar_kernel<<>>(N, alpha, Y); - CudaTest("solving add_scalar kernel failed"); + add_scalar_kernel<<>>(N, alpha, Y); } -__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] + b[index]; - } +__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, + float_t* y) { + CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { - vadd_kernel<<>>(N, a, b, y); - CudaTest("solving vadd kernel failed"); + vadd_kernel<<>>(N, a, b, y); } // TODO: use warp -__device__ void softmax(int n, const float_t *input, float_t *output) { - float_t max = input[0]; - for (size_t i = 1; i < n; i++) if (input[i] > max) max = input[i]; - float_t denominator = 0.0; - for (size_t i = 0; i < n; i++) { - output[i] = exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < n; i++) output[i] /= denominator; +__device__ void softmax(int n, const float_t* input, float_t* output) { + float_t max = input[0]; + for (size_t i = 1; i < n; i++) + if (input[i] > max) + max = input[i]; + float_t denominator = 0.0; + for (size_t i = 0; i < n; i++) { + output[i] = exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; } // TODO: use warp -__device__ void d_softmax(size_t n, const 
float_t *p, const float_t *dp, float_t *dy) { - for (size_t i = 0; i < n; i++) { - dy[i] = 0; - for (size_t j = 0; j < n; j++) { - float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; - dy[i] += df * dp[j]; - } - } -} - -__device__ void cross_entropy(int n, const label_t idx, const float_t *p, float_t &loss) { - if (p[idx] == 0.0) loss -= log(float_t(1e-10)); - else loss -= log(p[idx]); -} - -__device__ void d_cross_entropy(int n, const label_t idx, const float_t *p, float_t *d) { - for (int i = 0; i < n; i++) - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; +__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, + float_t* dy) { + for (size_t i = 0; i < n; i++) { + dy[i] = 0; + for (size_t j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__device__ void cross_entropy(int n, const label_t idx, const float_t* p, + float_t& loss) { + if (p[idx] == 0.0) + loss -= log(float_t(1e-10)); + else + loss -= log(p[idx]); +} + +__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, + float_t* d) { + for (int i = 0; i < n; i++) + if (i == (int)idx) + d[i] = -1.0 / (p[i] + 1e-10); + else + d[i] = 0.0; } // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, - const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked - softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len*i], loss[i]); - } - } -} - -void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { - softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); - CudaTest("solving softmax_cross_entropy kernel failed"); -} - -__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, - const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[i], out+len*i, out_grad); - d_softmax(len, out+len*i, out_grad, diff+len*i); - } -} - -void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); - CudaTest("solving d_softmax_cross_entropy kernel failed"); -} - -__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end-begin) { - if (masks[begin+i] == 1) - //total += loss[begin+i]; - total.reduce(loss[begin+i]); - } - total.thread_exit >(local_loss); -} - -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()); +__global__ void softmax_cross_entropy_kernel(int n, int len, + const float_t* in_data, + 
const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { // masked + softmax(len, in_data + len * i, + out_data + len * i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len * i], loss[i]); + } + } +} + +void softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, loss, out); +} + +__global__ void +d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; + d_cross_entropy(len, labels[i], out + len * i, out_grad); + d_softmax(len, out + len * i, out_grad, diff + len * i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + d_softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, out, diff); } - diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 1d81ea1012..6625f283b3 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,87 +1,102 @@ #include "net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { - context = new Context(); - num_samples = context->read_graph(dataset_str); - num_classes = context->read_labels(dataset_str); - context->norm_factor_counting(); // pre-compute normalizing factor - num_epochs = epochs; + context = new Context(); + // Context::create_blas_handle(); + num_samples = context->read_graph(dataset_str); + num_classes = context->read_labels(dataset_str); + context->degree_counting(); + context->norm_factor_counting(); // pre-compute normalizing factor + num_epochs = epochs; - std::cout << "Reading label masks ... "; - train_mask.resize(num_samples, 0); - val_mask.resize(num_samples, 0); - if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; - } else { - train_count = read_masks(dataset_str, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); - } - std::cout << "Done\n"; + std::cout << "Reading label masks ... 
"; + train_mask.resize(num_samples, 0); + val_mask.resize(num_samples, 0); + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, + train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + for (size_t i = train_begin; i < train_end; i++) + train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) + val_mask[i] = 1; + } else { + train_count = + read_masks(dataset_str, "train", train_begin, train_end, train_mask); + val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); + } + std::cout << "Done\n"; - num_layers = NUM_CONV_LAYERS + 1; - feature_dims.resize(num_layers + 1); - feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - feature_dims[3] = num_classes; // normalized output embedding: E - layers.resize(num_layers); + num_layers = NUM_CONV_LAYERS + 1; + feature_dims.resize(num_layers + 1); + feature_dims[0] = + context->read_features(dataset_str); // input feature dimension: D + feature_dims[1] = hidden1; // hidden1 level embedding: 16 + feature_dims[2] = num_classes; // output embedding: E + feature_dims[3] = num_classes; // normalized output embedding: E + layers.resize(num_layers); #ifndef CPU_ONLY - context->copy_data_to_device(); // copy labels and input features to the device + context + ->copy_data_to_device(); // copy labels and input features to the device #endif } -void Net::train(optimizer *opt, bool need_validate) { - std::cout << "\nStart training...\n"; - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - Timer t_epoch; - // run epoches - for (unsigned i = 0; i < num_epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; - t_epoch.Start(); +void Net::train(optimizer* opt, bool need_validate) { + std::cout << "\nStart training...\n"; + galois::StatTimer Tupdate("Train-WeightUpdate"); + galois::StatTimer Tfw("Train-Forward"); + galois::StatTimer Tbw("Train-Backward"); + galois::StatTimer Tval("Validation"); + Timer t_epoch; + // run epoches + for (unsigned i = 0; i < num_epochs; i++) { + std::cout << "Epoch " << std::setw(2) << i << std::fixed + << std::setprecision(3) << ":"; + t_epoch.Start(); - // training steps - set_netphases(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, &train_mask[0]); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, &train_mask[0]); // predict - Tfw.stop(); - Tbw.start(); - bprop(); // back propogation - Tbw.stop(); - Tupdate.start(); - update_weights(opt); // update parameters - Tupdate.stop(); - set_netphases(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - if (need_validate) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); - Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time 
<< ")\n"; - } else { - std::cout << " train_time = " << epoch_time << " ms\n"; - } - } + // training steps + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + Tfw.start(); + train_loss = + fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, + &train_mask[0]); // predict + Tfw.stop(); + Tbw.start(); + bprop(); // back propogation + Tbw.stop(); + Tupdate.start(); + update_weights(opt); // update parameters + Tupdate.stop(); + set_netphases(net_phase::test); + std::cout << " train_loss = " << std::setw(5) << train_loss + << " train_acc = " << std::setw(5) << train_acc; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + if (need_validate) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + Tval.start(); + double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], + val_loss, val_acc); + Tval.stop(); + std::cout << " val_loss = " << std::setw(5) << val_loss + << " val_acc = " << std::setw(5) << val_acc; + std::cout << " time = " << epoch_time + val_time + << " ms (train_time = " << epoch_time + << " val_time = " << val_time << ")\n"; + } else { + std::cout << " train_time = " << epoch_time << " ms\n"; + } + } } void Net::construct_layers() { - std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer - layers[0]->set_in_data(context->get_in_ptr()); // feed input data - set_contexts(); + std::cout << "\nConstructing layers...\n"; + append_conv_layer(0, true); // first conv layer + append_conv_layer(1); // hidden1 layer + append_out_layer(2); // output layer + layers[0]->set_in_data(context->get_in_ptr()); // feed input data + set_contexts(); } - diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index 5b60a9f22a..f4278688d1 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,38 +1,37 @@ #include "node.h" -#include void edge::alloc() { - //std::cout << "Allocating memory for tensors (intermediate features and gradients) ...\n"; #ifdef CPU_ONLY - data_ = new float_t[num_samples_ * ft_dim_]; - grad_ = new float_t[num_samples_ * ft_dim_]; + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; #else - alloc_gpu(); + alloc_gpu(); #endif } -void edge::merge_grads(vec_t *dst) { - assert(grad_ != NULL); - dst->resize(ft_dim_); - float_t *pdst = &(*dst)[0]; +void edge::merge_grads(vec_t* dst) { + assert(grad_ != NULL); + dst->resize(ft_dim_); + float_t* pdst = &(*dst)[0]; #ifdef CPU_ONLY - std::copy(grad_, grad_+ft_dim_, pdst); - // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < num_samples_; ++sample) { - for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; - //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); - } + std::copy(grad_, grad_ + ft_dim_, pdst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) + pdst[i] += grad_[sample * ft_dim_ + i]; + // vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); + } #else - merge_grads_gpu(pdst); + merge_grads_gpu(pdst); #endif } void edge::clear_grads() { #ifdef CPU_ONLY - std::fill(grad_, grad_+ft_dim_*num_samples_, float_t(0)); // TODO: need vectorize - //vectorize::fill(&grad_[0], grad_.size(), float_t(0)); + std::fill(grad_, grad_ + ft_dim_ * 
num_samples_, + float_t(0)); // TODO: need vectorize + // vectorize::fill(&grad_[0], grad_.size(), float_t(0)); #else - clear_grads_gpu(); + clear_grads_gpu(); #endif } - diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index da79217231..2443e9ed7c 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -2,14 +2,17 @@ #include "cutils.h" void edge::alloc_gpu() { - CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } -void edge::merge_grads_gpu(float_t *dst) { - CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); +void edge::merge_grads_gpu(float_t* dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), + cudaMemcpyDeviceToHost)); } void edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_ * num_samples_ * sizeof(float_t))); } diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index 3372378de1..fb10221f19 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,76 +1,89 @@ #include "optimizer.h" #include "galois/Galois.h" -void adagrad::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } +void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } } -void RMSprop::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); +void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); } -void adam::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; +void adam::update(const vec_t& dW, 
vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& vt = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; } -void adamax::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); - b1_t *= b1; +void adamax::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); + b1_t *= b1; } -void gradient_descent::update(const vec_t &dW, vec_t &W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); +void gradient_descent::update(const vec_t& dW, vec_t& W, bool parallelize) { + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); } -void momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, galois::loopname("momentum_update")); +void momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, + galois::loopname("momentum_update")); } -void nesterov_momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, galois::loopname("nesterov_momentum_update")); +void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, + galois::loopname("nesterov_momentum_update")); } - diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 832da51cbf..908ce4f32a 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,4 +1,3 @@ #include "optimizer.h" -void adam::update_gpu(const float_t *dW, float_t *W) { 
-} +void adam::update_gpu(const float_t* dW, float_t* W) {} From 51efbcb099bf2452dd73d83d9b6cc9b95b012077 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 16:58:39 -0600 Subject: [PATCH 034/660] fix gpu bugs --- libdeepgalois/include/layers/layer.h | 132 ++++------ libdeepgalois/include/math_functions.hh | 50 ++-- libdeepgalois/include/net.h | 45 +--- libdeepgalois/src/aggregator.cu | 12 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 87 +++---- .../src/layers/softmax_loss_layer.cpp | 7 +- libdeepgalois/src/math_functions.cu | 246 +++++++++++------- libdeepgalois/src/net.cpp | 17 ++ libgpu/include/graph_gpu.h | 8 +- 9 files changed, 306 insertions(+), 298 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index c0e694d21c..7b8bbc55a4 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -52,83 +52,61 @@ class layer : public node { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) = 0; - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" - << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, - size_t sample_count, mask_t* masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; - } - void set_in_data(float_t* data) { - assert(data.size() == input_dims[0] * input_dims[1]); - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - // - // allocate memory for intermediate features - // prev_->get_data() = data; - // std::copy(data.begin(), data.end(), prev_->get_data()); - // allocate memory for intermediate gradients - // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors and gradients - next_->alloc(); - // next_->get_data().resize(output_dims[0]*output_dims[1]); - } - void alloc_grad() { - // allocate memory for intermediate gradients - // next_->get_gradient().resize(output_dims[0]*output_dims[1]); - } - void forward() { - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - back_propagation(prev()->get_data(), next()->get_data(), - next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer* opt) { - // parallelize only when target size is big enough to mitigate thread - // spawning overhead. 
- bool parallel = (W.size() >= 512); - // vec_t diff; - // prev()->merge_grads(&diff); - // auto in_data = prev()->get_data(); - // float_t rcp_batch_size = float_t(1.0) / in_data.size(); - // for (size_t i = 0; i < diff.size(); ++i) - // diff[i] *= rcp_batch_size; - opt->update(weight_grad, W, parallel); // W += grad - // prev()->clear_grads(); - next()->clear_grads(); - } - inline acc_t get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; - } + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + mask_t *get_device_masks() { return d_masks_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() + << " input[" << input_dims[0] << "," << input_dims[1] + << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; +#ifndef CPU_ONLY + copy_masks_device(input_dims[0], masks_, d_masks_); +#endif + } + void set_in_data(float_t *data) { + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + } + void alloc_grad() { + // allocate memory for intermediate gradients + } + void forward() { + //std::cout << name_ << ": forwarding ... "; + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + //std::cout << name_ << ": backwarding ... "; + back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer *opt) { + //std::cout << name_ << ": weight updating ... "; + //vec_t diff; + //prev()->merge_grads(&diff); +#ifdef CPU_ONLY + // parallelize only when target size is big enough to mitigate thread spawning overhead. 
+ bool parallel = (W.size() >= 512); + opt->update(weight_grad, W, parallel); // W += grad +#else + opt->update_gpu(d_weight_grad, d_W); // W += grad +#endif + //prev()->clear_grads(); + next()->clear_grads(); + } protected: unsigned level_; // layer id: [0, num_layers-1] diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 6f4348ff34..02afab2c49 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -70,37 +70,25 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); -void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned* masks, float_t* in, float_t* out, - float_t* matrix, float_t* grad); -void copy_gpu(size_t len, const float_t* in, float_t* out); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned* masks, - float_t* in, float_t* out); -void vadd_gpu(const int n, const float_t* a, const float_t* b, - float_t* out); // vector add -void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU -void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, - float_t* out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out); // dropout -void d_dropout_gpu(const float scale, const float_t* in_diff, - const unsigned* mask, - float_t* out_diff); // dropout derivative -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C); -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, - float_t* C); // matrix multiply -int argmax_gpu(const size_t n, const float_t* x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, - const mask_t* masks, const label_t* labels, - const float_t* out_data, float_t* diff); -void scal_gpu(const int N, const float alpha, float* X); +void copy_gpu(size_t len, const float_t *in, float_t *out); +void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add +void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU +void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout +void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t 
*labels, float_t *loss, float_t *out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); +void scal_gpu(const int N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels); + +void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out); +void loss_malloc_device(int n, float_t *&loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad); #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 87a0e3b72b..9d3e1c1184 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -62,17 +62,17 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } - // forward propagation: [begin, end) is the range of samples used. - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) - layers[i]->forward(); - return layers[num_layers - 1]->get_masked_loss(); - } + // forward propagation: [begin, end) is the range of samples used. + acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { + // set mask for the last layer + layers[num_layers-1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i ++) + layers[i]->forward(); + return layers[num_layers-1]->get_masked_loss(); + } // back propogation void bprop() { @@ -108,27 +108,8 @@ class Net { std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - - // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), - [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1] - ->next() - ->get_data()[i * num_classes])); - if ((label_t)preds == context->get_label(i)) - accuracy_all += 1.0; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; - } + // comparing outputs with the ground truth (labels) + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks); }; #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index ea41fd3dcb..c1f578caa1 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -32,10 +32,10 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, } } -void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, 
- bool norm, const float_t* norm_factor) { - unsigned n = g.nnodes; - CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>( - n, len, g, in, out, norm, norm_factor); +void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + unsigned n = g.nnodes; + std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 06ec53b2db..1ef9be19c1 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -62,28 +62,22 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - // input: x*y; W: y*z; output: x*z - // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, - galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else - matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate - if (act_) { - galois::do_all( - galois::iterate((size_t)0, x), - [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, - galois::loopname("relu")); - } +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); + }, galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + relu(z, &out_data[i*z], &out_data[i*z]); + }, galois::loopname("relu")); + } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -129,36 +123,29 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #else // GPU forward -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - assert(y <= 128); // currently only support feature length <= 128 - if (dropout_ && phase_ == net_phase::train) { - dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else - matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) - relu_gpu(x * z, out_data, out_data); +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + assert(y <= 128); // currently only support feature length <= 128 + 
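The CPU forward_propagation above computes h[l] = σ(W · Σ(h[l-1])): features are first multiplied by W (to shrink the feature length before aggregation when y > z), then summed over graph neighbors, then passed through ReLU. A serial, self-contained sketch of those steps over a CSR graph (illustrative only; the symmetric norm[v]*norm[u] scaling is one plausible choice for the normalization applied by the aggregate step):

// Serial sketch of the conv-layer forward pass: transform, aggregate, ReLU.
#include <algorithm>
#include <cstddef>
#include <vector>

// C (x by z) = A (x by y) * B (y by z), all row-major.
static void matmul(size_t x, size_t y, size_t z, const std::vector<float>& A,
                   const std::vector<float>& B, std::vector<float>& C) {
  C.assign(x * z, 0.0f);
  for (size_t i = 0; i < x; i++)
    for (size_t k = 0; k < y; k++)
      for (size_t j = 0; j < z; j++)
        C[i * z + j] += A[i * y + k] * B[k * z + j];
}

// out[v] = sum over neighbors u of norm[v]*norm[u]*in[u]; row_start/edge_dst is CSR.
static void aggregate_sketch(size_t n, size_t len,
                             const std::vector<size_t>& row_start,
                             const std::vector<size_t>& edge_dst,
                             const std::vector<float>& norm,
                             const std::vector<float>& in,
                             std::vector<float>& out) {
  out.assign(n * len, 0.0f);
  for (size_t v = 0; v < n; v++)
    for (size_t e = row_start[v]; e < row_start[v + 1]; e++) {
      size_t u = edge_dst[e];
      float s = norm[v] * norm[u];
      for (size_t f = 0; f < len; f++)
        out[v * len + f] += s * in[u * len + f];
    }
}

static void relu_inplace(std::vector<float>& v) {
  for (float& x : v) x = std::max(x, 0.0f);
}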
assert(in_data != NULL); + assert(in_temp != NULL); + assert(dropout_mask != NULL); + //std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" << out_data << "\n"; + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + //aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x*z, out_data, out_data); } // GPU backward -void graph_conv_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - if (act_) - d_relu_gpu(x * z, out_grad, out_data, out_temp); - else - copy_gpu(x * z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, - in_temp); - update_all(y, context->graph_gpu, in_temp, in_grad, true, - context->d_norm_factor); - if (dropout_) - d_dropout(y, scale_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, - d_weight_grad); +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); + else copy_gpu(x*z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + if (dropout_) d_dropout_gpu(x*y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 579de65667..6c29dc9a14 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -63,10 +63,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, - context->d_labels, out_data, in_grad); +acc_t softmax_loss_layer::get_masked_loss() { + return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 8dbe141c96..70ddd8826d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,18 +1,41 @@ #include "math_functions.hh" #include "context.h" -void gpu_rng_uniform(const int n, unsigned* r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); +void gpu_rng_uniform(const int n, unsigned *r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, - float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); - const float range = b - a; - if (range != float_t(1)) - scal_gpu(n, range, r); - if (a != float_t(0)) - add_scalar_gpu(n, a, r); +void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - 
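The GPU back_propagation above realizes ∂E/∂y[l-1] = ∂E/∂y[l] · Wᵀ (followed by the same neighbor aggregation) and ∂E/∂W[l] = (y[l-1])ᵀ · ∂E/∂y[l], which is what the two sgemm calls with CblasTrans express. A serial sketch of those two transposed products (illustrative names, row-major storage as in the layer code):

// Serial sketch of the backward-pass products:
// in_grad = out_grad * W^T and weight_grad = in_data^T * out_grad.
#include <cstddef>
#include <vector>

// in_grad (x by y) = out_grad (x by z) * W^T, with W stored row-major as y by z.
static void grad_wrt_input(size_t x, size_t y, size_t z,
                           const std::vector<float>& out_grad,
                           const std::vector<float>& W,
                           std::vector<float>& in_grad) {
  in_grad.assign(x * y, 0.0f);
  for (size_t i = 0; i < x; i++)
    for (size_t j = 0; j < y; j++)
      for (size_t k = 0; k < z; k++)
        in_grad[i * y + j] += out_grad[i * z + k] * W[j * z + k];
}

// weight_grad (y by z) = in_data^T (y by x) * out_grad (x by z).
static void grad_wrt_weight(size_t x, size_t y, size_t z,
                            const std::vector<float>& in_data,
                            const std::vector<float>& out_grad,
                            std::vector<float>& weight_grad) {
  weight_grad.assign(y * z, 0.0f);
  for (size_t i = 0; i < x; i++)
    for (size_t j = 0; j < y; j++)
      for (size_t k = 0; k < z; k++)
        weight_grad[j * z + k] += in_data[i * y + j] * out_grad[i * z + k];
}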
a; + if (range != float_t(1)) scal_gpu(n, range, r); + if (a != float_t(0)) add_scalar_gpu(n, a, r); +} + +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { + CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +} + +void loss_malloc_device(int n, float_t *&loss) { + CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +} + +void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks) { + assert(h_masks != NULL); + CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +} + +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad) { + if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0/(y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y*z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, @@ -21,40 +44,33 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss) { - CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK( - cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); +__global__ void setup_curand_kernel(const int n, curandState *state) { + CUDA_KERNEL_LOOP(i, n) { + //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + } } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned* masks, float_t* in, float_t* out, - float_t* matrix, float_t* grad) { - if (dropout) - CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0 / (y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y * z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
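gconv_malloc_device above initializes the y×z weight matrix from U(-r, r) with r = sqrt(6/(y+z)), the Glorot & Bengio (AISTATS 2010) rule. A host-side sketch of the same initialization using the standard <random> facilities (illustrative only):

// Glorot/Xavier uniform initialization: r = sqrt(6 / (fan_in + fan_out)), W ~ U(-r, r).
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

static std::vector<float> glorot_uniform(size_t fan_in, size_t fan_out,
                                         unsigned seed = 1) {
  float r = std::sqrt(6.0f / (float)(fan_in + fan_out));
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dist(-r, r);
  std::vector<float> W(fan_in * fan_out);
  for (float& w : W) w = dist(gen); // one independent draw per weight
  return W;
}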
1 : 0; + masks[i] = 1.0 - dropout_rate; + out[i] = in[i] * masks[i] * scale; + } } -void copy_gpu(size_t len, const float_t* in, float_t* out) { - CUDA_CHECK( - cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); -} - -__global__ void dropout_kernel(const int n, const float scale, - const float dropout_rate, const float_t* in, - unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - // masks[i] = bernoulli(dropout_rate); - out[i] = in[i] * masks[i] * scale; - } +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { + curandState *devStates; + CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); + //std::cout << "[debug]: setup curand, n = " << n << "\n"; + //setup_curand_kernel<<>>(n, devStates); + //CudaTest("solving setup_curand kernel failed"); + //std::cout << "[debug]: dropout_gpu\n"; + dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); + CudaTest("solving dropout kernel failed"); + CUDA_CHECK(cudaFree(devStates)); + //std::cout << "[debug]: dropout_gpu done\n"; } void dropout_gpu(const int n, const float scale, const float dropout_rate, @@ -68,8 +84,10 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } } -void relu_gpu(const int n, const float_t* in, float_t* out) { - relu_kernel<<>>(n, in, out); +void relu_gpu(const int n, const float_t *in, float_t* out) { + //std::cout << "[debug]: relu_gpu\n"; + relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, @@ -99,23 +117,17 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, K, &alpha, B, ldb, A, lda, &beta, C, N)); } -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, float_t* C) { - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { + //std::cout << "[debug]: matmul1D1D_gpu\n"; + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -// the arguments of the maxima -int argmax_gpu(const size_t n, const float_t* x) { return 0; } - -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, - const float beta, float* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, - N, x, 1, &beta, y, 1)); +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, float* y) { + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
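The dropout_kernel added here only stores the keep probability into masks[i] as a placeholder; the intended behaviour, which the CPU path in math_functions.cpp implements with bernoulli(), is to draw a per-element keep mask and rescale the survivors. A self-contained sketch of that standard inverted-dropout scheme (illustrative only; the scale = 1/(1-rate) convention and P(keep) = 1-rate are assumptions, not taken from the patch):

// Standard inverted dropout: sample a keep mask, rescale kept activations.
#include <cstddef>
#include <random>
#include <vector>

static void dropout_sketch(float dropout_rate, const std::vector<float>& in,
                           std::vector<unsigned>& masks,
                           std::vector<float>& out, unsigned seed = 1) {
  const float scale = 1.0f / (1.0f - dropout_rate); // keeps the expected activation unchanged
  std::mt19937 gen(seed);
  std::bernoulli_distribution keep(1.0 - dropout_rate); // P(keep) = 1 - rate (assumed)
  masks.resize(in.size());
  out.resize(in.size());
  for (size_t i = 0; i < in.size(); i++) {
    masks[i] = keep(gen) ? 1u : 0u;
    out[i] = in[i] * masks[i] * scale;
  }
}

// The backward pass (d_dropout in the patch) reuses the same mask:
// out_diff[i] = in_diff[i] * masks[i] * scale.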
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { @@ -212,42 +224,90 @@ __device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, - const float_t* in_data, - const mask_t* masks, - const label_t* labels, - float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked - softmax(len, in_data + len * i, - out_data + len * i); // normalize using softmax - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len * i], loss[i]); - } - } +__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, + const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { // masked + softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len*i], loss[i]); + } + } +} + +void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { + softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); +} + +__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, + const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; // TODO + d_cross_entropy(len, labels[i], out+len*i, out_grad); + d_softmax(len, out+len*i, out_grad, diff+len*i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy kernel failed"); +} + +__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end-begin) { + if (masks[begin+i] == 1) + //total += loss[begin+i]; + total.reduce(loss[begin+i]); + } + total.thread_exit >(local_loss); +} + +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; } -void softmax_cross_entropy_gpu(int n, int len, const float_t* in, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out) { - softmax_cross_entropy_kernel<<>>( - n, len, in, masks, labels, loss, out); -} - -__global__ void -d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; - d_cross_entropy(len, labels[i], out + len * i, out_grad); - d_softmax(len, out + len * i, out_grad, diff + len * i); - } -} - -void d_softmax_cross_entropy_gpu(int n, int len, const 
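masked_avg_loss above reduces the per-sample losses of the masked range with an HGAccumulator backed by cub::BlockReduce. The same reduction can be illustrated with a single atomicAdd accumulator; the sketch below is self-contained and deliberately untuned, and all names are illustrative rather than part of the patch:

// CUDA sketch of the masked average-loss reduction with a plain atomicAdd.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void masked_loss_sum(size_t begin, size_t end, const uint8_t* masks,
                                const float* loss, float* total) {
  size_t i = begin + blockIdx.x * blockDim.x + threadIdx.x;
  if (i < end && masks[i] == 1)
    atomicAdd(total, loss[i]); // one add per masked sample
}

float masked_avg_loss_sketch(size_t begin, size_t end, size_t count,
                             const uint8_t* d_masks, const float* d_loss) {
  float* d_total;
  cudaMalloc(&d_total, sizeof(float));
  cudaMemset(d_total, 0, sizeof(float));
  size_t n = end - begin;
  masked_loss_sum<<<(n + 255) / 256, 256>>>(begin, end, d_masks, d_loss, d_total);
  float total = 0.0f;
  cudaMemcpy(&total, d_total, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_total);
  return total / (float)count; // averaged over the masked sample count
}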
float_t* in, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { - d_softmax_cross_entropy_kernel<<>>( - n, len, in, masks, labels, out, diff); +// the arguments of the maxima +__device__ size_t argmax_device(const size_t n, const float_t *x) { + float_t max = x[0]; + size_t max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, size_t end, mask_t *masks, float_t *preds, label_t *labels, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_accuracy; + CUDA_KERNEL_LOOP(i, end-begin) { + if (masks[begin+i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, preds+(begin+i)*num_classes); + if (pred == labels[begin+i]) total.reduce(1.0); + } + } + total.thread_exit >(local_accuracy); +} + +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels) { + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>(num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6625f283b3..f76ccaeb8a 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -100,3 +100,20 @@ void Net::construct_layers() { layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { +#ifdef CPU_ONLY + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); + if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; +#else + return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS-1]->next()->get_data(), context->d_labels); +#endif +} + diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 2458ad8632..050d7bbc69 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -110,18 +110,18 @@ struct CSRGraph { //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); } - inline __device__ __host__ index_type getEdgeDst(unsigned edge) { + __device__ __host__ index_type getEdgeDst(unsigned edge) { assert(edge < nedges); return edge_dst[edge]; }; - inline __device__ __host__ node_data_type getData(unsigned vid) { + __device__ __host__ node_data_type getData(unsigned vid) { return node_data[vid]; } - inline __device__ __host__ index_type edge_begin(unsigned src) { + __device__ __host__ index_type edge_begin(unsigned src) { assert(src <= nnodes); return row_start[src]; }; - inline __device__ __host__ index_type edge_end(unsigned src) { + __device__ __host__ index_type edge_end(unsigned src) { assert(src <= nnodes); return row_start[src+1]; }; From 859862435478f5361815fb9c1e2da1ad5d6dab1f Mon Sep 17 00:00:00 2001 From: Loc 
Hoang Date: Tue, 25 Feb 2020 17:42:43 -0600 Subject: [PATCH 035/660] got back optimizer changes --- libdeepgalois/include/optimizer.h | 122 ++++++------------------------ 1 file changed, 23 insertions(+), 99 deletions(-) diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index f1822adc7d..ed8e7654d9 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -15,6 +15,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; + virtual void update_gpu(const float_t* dW, float_t* W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -46,22 +47,8 @@ struct stateful_optimizer : public optimizer { **/ struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, - galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate private: float_t eps; @@ -75,15 +62,8 @@ struct adagrad : public stateful_optimizer<1> { **/ struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, - galois::loopname("rms_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t mu; // decay term private: @@ -94,25 +74,14 @@ struct RMSprop : public stateful_optimizer<1> { // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { adam() - : alpha(0.01), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), - b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& mt = get<0>(W); - vec_t& vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; - } + : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize); +#ifdef CPU_ONLY + void update_gpu(const float_t* dW, float_t* W) {} +#else + void update_gpu(const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t b1; // decay term @@ -134,20 +103,8 @@ struct adamax : public stateful_optimizer<2> { adamax() : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), 
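The adam::update body that this commit moves out of the header applies the usual Adam rule with bias-corrected first and second moments. A serial sketch of that rule, matching the hyper-parameter names used here (illustrative only):

// Adam update:
//   m <- b1*m + (1-b1)*dW,  v <- b2*v + (1-b2)*dW^2
//   W <- W - alpha * (m / (1-b1_t)) / sqrt(v / (1-b2_t) + eps)
#include <cmath>
#include <cstddef>
#include <vector>

struct adam_state {
  float alpha = 0.01f, b1 = 0.9f, b2 = 0.999f, eps = 1e-8f;
  float b1_t = 0.9f, b2_t = 0.999f; // running powers of b1 and b2
  std::vector<float> m, v;          // first and second moment estimates
};

static void adam_update(adam_state& s, const std::vector<float>& dW,
                        std::vector<float>& W) {
  if (s.m.empty()) { s.m.assign(W.size(), 0.0f); s.v.assign(W.size(), 0.0f); }
  for (size_t i = 0; i < W.size(); i++) {
    s.m[i] = s.b1 * s.m[i] + (1.0f - s.b1) * dW[i];
    s.v[i] = s.b2 * s.v[i] + (1.0f - s.b2) * dW[i] * dW[i];
    W[i] -= s.alpha * (s.m[i] / (1.0f - s.b1_t)) /
            std::sqrt(s.v[i] / (1.0f - s.b2_t) + s.eps);
  }
  s.b1_t *= s.b1; // advance the bias-correction terms once per update
  s.b2_t *= s.b2;
}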
eps(float_t(1e-8)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& mt = get<0>(W); - vec_t& ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, - galois::loopname("adamax_update")); - b1_t *= b1; - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t b1; // decay term @@ -158,19 +115,12 @@ struct adamax : public stateful_optimizer<2> { float_t eps; // constant value to avoid zero-division }; -/** - * SGD without momentum - * - * slightly faster than tiny_dnn::momentum - **/ +// SGD without momentum +// slightly faster than tiny_dnn::momentum struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize) { - galois::do_all( - galois::iterate((size_t)0, W.size()), - [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, - galois::loopname("gradient_descent_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -185,21 +135,8 @@ struct gradient_descent : public optimizer { struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& dWprev = get<0>(W); - - // for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = - mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - //}); - }, - galois::loopname("momentum_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay @@ -217,21 +154,8 @@ struct nesterov_momentum : public stateful_optimizer<1> { public: nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& dWprev = get<0>(W); - - // for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = - mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - //}); - }, - galois::loopname("nesterov_momentum_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay From e4bb47cfb1b2275e22e36857ca6faa8a5d86abe7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 17:54:01 -0600 Subject: [PATCH 036/660] getting back changes erased by clangformat rebase --- libdeepgalois/include/aggregator.h | 2 - libdeepgalois/include/context.h | 54 ++- libdeepgalois/include/cutils.h | 12 + libdeepgalois/include/layers/layer.h | 113 ++++--- .../include/layers/softmax_loss_layer.h | 1 + libdeepgalois/include/net.h | 45 ++- libdeepgalois/include/node.h | 4 +- libdeepgalois/include/types.h | 2 + libdeepgalois/src/aggregator.cu | 14 +- 
libdeepgalois/src/context.cpp | 95 +----- libdeepgalois/src/layers/graph_conv_layer.cpp | 96 +++--- .../src/layers/softmax_loss_layer.cpp | 36 +- libdeepgalois/src/math_functions.cpp | 37 +- libdeepgalois/src/math_functions.cu | 318 +++++++++--------- libdeepgalois/src/net.cpp | 21 +- libdeepgalois/src/node.cpp | 3 + libdeepgalois/src/node.cu | 2 +- 17 files changed, 425 insertions(+), 430 deletions(-) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 01b1a1e8c8..552925c1bf 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -6,8 +6,6 @@ void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); #else #include "graph_gpu.h" -#define TB_SIZE 256 -#define WARP_SIZE 32 void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 688ed9a2a5..47b32d023e 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -3,27 +3,19 @@ #include #include "types.h" #include "utils.h" -#include "lgraph.h" #ifdef CPU_ONLY +#include "lgraph.h" #include "gtypes.h" #else #include "graph_gpu.h" -#endif #include "cutils.h" +#endif class Context { public: Context(); ~Context(); enum Brew { CPU, GPU }; - // static Context& Get(); -#ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } - // static void create_blas_handle(); -#endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } int solver_count() { return solver_count_; } @@ -39,30 +31,36 @@ class Context { label_t get_label(size_t i) { return labels[i]; } label_t* get_labels_ptr(size_t i) { return &(labels[0]); } float_t* get_in_ptr(); - void degree_counting(); - void norm_factor_counting(); - std::vector labels; // labels for classification: N x 1 - float_t* norm_factor; // normalization constant based on graph structure - std::vector degrees; - vec_t h_feats; // input features: N x D - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D -#ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N - void genGraph(LGraph& lg, Graph& g); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); -#else - CSRGraph graph_gpu; // the input graph, |V| = N - label_t* d_labels; // labels on device - float_t* d_norm_factor; // norm_factor on device - float_t* d_feats; // input features on device size_t read_graph_gpu(std::string dataset_str); void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + void norm_factor_counting(); + void norm_factor_counting_gpu(); + + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + std::vector labels; // labels for classification: N x 1 + label_t* d_labels; // labels on device + vec_t h_feats; // input features: N x D + float_t* d_feats; // input features on device + float_t* norm_factor; // normalization constant based on graph structure + float_t* d_norm_factor; // norm_factor on device + +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N 
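norm_factor declared above is filled by norm_factor_counting with 1/sqrt(deg(v)) per vertex, and 0 for isolated vertices. A serial sketch of that computation from CSR row offsets (illustrative only):

// Per-vertex normalization constants from CSR row offsets: 1/sqrt(degree).
#include <cmath>
#include <cstddef>
#include <vector>

static std::vector<float> norm_factors(const std::vector<size_t>& row_start) {
  size_t n = row_start.size() - 1; // number of vertices
  std::vector<float> norm(n);
  for (size_t v = 0; v < n; v++) {
    float d = std::sqrt((float)(row_start[v + 1] - row_start[v]));
    norm[v] = (d == 0.0f) ? 0.0f : 1.0f / d; // isolated vertices get 0
  }
  return norm;
}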
+ void genGraph(LGraph& lg, Graph& g); +#else + CSRGraph graph_gpu; // the input graph, |V| = N + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } #endif protected: @@ -72,8 +70,6 @@ class Context { curand_generator_; // used to generate random numbers on GPU #endif Brew mode_; - // shared_ptr random_generator_; - // Parallel training int solver_count_; int solver_rank_; bool multiprocess_; diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 830a4bbd08..fac2cfaa64 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,6 +13,18 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } +inline unsigned CudaTest(const char* msg) { + cudaError_t e; + // cudaThreadSynchronize(); + cudaDeviceSynchronize(); + if (cudaSuccess != (e = cudaGetLastError())) { + fprintf(stderr, "%s: %d\n", msg, e); + fprintf(stderr, "%s\n", cudaGetErrorString(e)); + exit(-1); + } + return 0; +} + inline const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 7b8bbc55a4..609047853a 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -44,69 +44,76 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context* ctx) { context = ctx; } - // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = - // 0; virtual void back_propagation(const vec_t &in_data, const vec_t - // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual acc_t get_masked_loss() { return acc_t(0); } virtual void forward_propagation(const float_t* in_data, float_t* out_data) = 0; virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) = 0; - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - mask_t *get_device_masks() { return d_masks_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() - << " input[" << input_dims[0] << "," << input_dims[1] - << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" + << input_dims[0] << "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, + size_t sample_count, mask_t* masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; #ifndef CPU_ONLY - copy_masks_device(input_dims[0], masks_, d_masks_); + copy_masks_device(input_dims[0], masks_, d_masks_); #endif - } - void set_in_data(float_t *data) { - 
prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors and gradients - next_->alloc(); - } - void alloc_grad() { - // allocate memory for intermediate gradients - } - void forward() { - //std::cout << name_ << ": forwarding ... "; - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - //std::cout << name_ << ": backwarding ... "; - back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer *opt) { - //std::cout << name_ << ": weight updating ... "; - //vec_t diff; - //prev()->merge_grads(&diff); + } + void set_in_data(float_t* data) { + assert(data.size() == input_dims[0] * input_dims[1]); + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + // + // allocate memory for intermediate features + // prev_->get_data() = data; + // std::copy(data.begin(), data.end(), prev_->get_data()); + // allocate memory for intermediate gradients + // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + } + void alloc_grad() { + // allocate memory for intermediate gradients + } + void forward() { + std::cout << name_ << ": forwarding ... "; + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + std::cout << name_ << ": backwarding ... "; + back_propagation(prev()->get_data(), next()->get_data(), + next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer* opt) { + std::cout << name_ << ": weight updating ... "; + // vec_t diff; + // prev()->merge_grads(&diff); #ifdef CPU_ONLY - // parallelize only when target size is big enough to mitigate thread spawning overhead. - bool parallel = (W.size() >= 512); - opt->update(weight_grad, W, parallel); // W += grad + // parallelize only when target size is big enough to mitigate thread + // spawning overhead. 
+ bool parallel = (W.size() >= 512); + opt->update(weight_grad, W, parallel); // W += grad #else - opt->update_gpu(d_weight_grad, d_W); // W += grad + opt->update_gpu(d_weight_grad, d_W); // W += grad #endif - //prev()->clear_grads(); - next()->clear_grads(); - } + // prev()->clear_grads(); + next()->clear_grads(); + } protected: unsigned level_; // layer id: [0, num_layers-1] diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 0a680a3209..0fa56cf7fe 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -12,4 +12,5 @@ class softmax_loss_layer : public layer { virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); + virtual acc_t get_masked_loss(); }; diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 9d3e1c1184..87a0e3b72b 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -62,17 +62,17 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } - // forward propagation: [begin, end) is the range of samples used. - acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { - // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i ++) - layers[i]->forward(); - return layers[num_layers-1]->get_masked_loss(); - } + // forward propagation: [begin, end) is the range of samples used. + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) + layers[i]->forward(); + return layers[num_layers - 1]->get_masked_loss(); + } // back propogation void bprop() { @@ -108,8 +108,27 @@ class Net { std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks); + + // comparing outputs with the ground truth (labels) + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), + [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1] + ->next() + ->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; + } }; #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 918b91b86c..8b48e85aa8 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -31,9 +31,7 @@ class node : public std::enable_shared_from_this { class edge { public: edge(node* prev, size_t n, size_t len) - : num_samples_(n), ft_dim_(len), - // data_(vec_t(n*len)), 
grad_(vec_t(n*len)), - data_(NULL), grad_(NULL), prev_(prev) {} + : num_samples_(n), ft_dim_(len), data_(NULL), grad_(NULL), prev_(prev) {} void alloc(); void alloc_gpu(); diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 5890ed307c..387b5f5b60 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -20,5 +20,7 @@ typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test #define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define WARP_SIZE 32 #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index c1f578caa1..3a0288b197 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -32,10 +32,12 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, } } -void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - unsigned n = g.nnodes; - std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); - CudaTest("solving update_all kernel failed"); +void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + std::cout << "[debug]: update_all on GPU\n"; + unsigned n = g.nnodes; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>( + n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 04d7c14476..785f4b2d26 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,70 +1,11 @@ #include "context.h" #include "gtypes.h" -#include -#include - -// random seeding -int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) - fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} #ifdef CPU_ONLY Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) {} Context::~Context() {} -#else -cublasHandle_t Context::cublas_handle_ = 0; -curandGenerator_t Context::curand_generator_ = 0; - -Context::Context() - : mode_(Context::GPU), solver_count_(1), solver_rank_(0), - multiprocess_(false) { - // void Context::create_blas_handle() { - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} - -Context::~Context() { - if (cublas_handle_) - CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } -} - -void Context::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) - return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) - CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - 
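update_all above launches update_all_kernel to add each vertex's normalized neighbor features into its output row. A deliberately simplified one-thread-per-vertex CUDA sketch of that aggregation (illustrative only; the patch's kernel is organized around the TB_SIZE/WARP_SIZE constants instead, and the symmetric normalization shown is an assumption):

// Simplified CUDA sketch of normalized neighbor aggregation over a CSR graph.
#include <cuda_runtime.h>

__global__ void aggregate_kernel_sketch(size_t n, size_t len, const int* row_start,
                                        const int* edge_dst, const float* norm,
                                        const float* in, float* out) {
  size_t v = blockIdx.x * blockDim.x + threadIdx.x;
  if (v >= n) return;
  for (int e = row_start[v]; e < row_start[v + 1]; e++) {
    int u = edge_dst[e];
    float s = norm ? norm[v] * norm[u] : 1.0f; // assumed symmetric normalization
    for (size_t f = 0; f < len; f++)
      out[v * len + f] += s * in[u * len + f]; // out assumed zeroed beforehand
  }
}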
CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} #endif size_t Context::read_graph(std::string dataset_str) { @@ -113,24 +54,8 @@ void Context::genGraph(LGraph& lg, Graph& g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } -float_t* Context::get_in_ptr() { return &h_feats[0]; } -#else -size_t Context::read_graph_gpu(std::string dataset_str) { - std::string filename = path + dataset_str + ".csgr"; - graph_gpu.read(filename.c_str(), false); - return graph_gpu.nnodes; -} -void Context::copy_data_to_device() { - CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), - cudaMemcpyHostToDevice)); -} -float_t* Context::get_in_ptr() { return d_feats; } +float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif // user-defined pre-computing function, called during initialization @@ -140,25 +65,17 @@ void Context::norm_factor_counting() { norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&](auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); + auto degree = std::distance(graph_cpu.edge_begin(v), + graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); -#endif -} - -void Context::degree_counting() { -#ifdef CPU_ONLY - degrees.resize(n); - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - degrees[v] = std::distance(graph_cpu.edge_begin(v), - graph_cpu.edge_end(v)); - }, - galois::loopname("DegreeCounting")); +#else + norm_factor_counting_gpu(); #endif } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 1ef9be19c1..715fcafd39 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,10 +37,10 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { - std::cout << name_ - << ": allocating memory for parameters and intermediate data... "; Timer t_alloc; t_alloc.Start(); + // std::cout << name_ << ": allocating memory for parameters and intermediate + // data... 
"; #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); @@ -57,27 +57,33 @@ void graph_conv_layer::init() { d_weight_grad); #endif t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; + // std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - // input: x*y; W: y*z; output: x*z - // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); - }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(z, &out_data[i*z], &out_data[i*z]); - }, galois::loopname("relu")); - } +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, + galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else + matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + if (act_) { + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, + galois::loopname("relu")); + } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -123,29 +129,39 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #else // GPU forward -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - assert(y <= 128); // currently only support feature length <= 128 - assert(in_data != NULL); - assert(in_temp != NULL); - assert(dropout_mask != NULL); - //std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" << out_data << "\n"; - if (dropout_ && phase_ == net_phase::train) { - dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - //aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) relu_gpu(x*z, out_data, out_data); +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + assert(y <= 128); // currently only support feature length <= 128 + assert(in_data != NULL); + assert(in_temp != NULL); + assert(dropout_mask != NULL); + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + } else + matmul1D1D_gpu(x, z, y, 
in_data, d_W, out_temp); + aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) + relu_gpu(x * z, out_data, out_data); } // GPU backward -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); - else copy_gpu(x*z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout_gpu(x*y, scale_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_temp); + else + copy_gpu(x * z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_temp); + update_all(y, context->graph_gpu, in_temp, in_grad, true, + context->d_norm_factor); + if (dropout_) + d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 6c29dc9a14..85e81d038c 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -6,19 +6,19 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); + std::cout << name_ << ": allocating memory for intermediate data... 
"; #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - out_malloc_device(in_dims[0], masks_, d_masks_, loss); + loss_malloc_device(in_dims[0], loss); #endif + std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t - // &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -35,8 +35,6 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::loopname("softmax-loss-fw")); } -// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t -// &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { @@ -56,6 +54,25 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } + +acc_t softmax_loss_layer::get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} + #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -63,7 +80,14 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, out_data, in_grad); +} + acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, d_masks_, loss); + return masked_avg_loss(begin_, end_, count_, masks_, loss); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 2e2d68f05d..e21bb42396 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -340,41 +340,41 @@ float reduce_mean(const vec_t& x) { } void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, vec_t& out) { - assert(mask.size() == out.size()); - // rng_bernoulli(1. - dropout_rate, mask); // Create random numbers + std::vector& masks, vec_t& out) { + assert(masks.size() == out.size()); + // rng_bernoulli(1. 
- dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, float_t* out) { + std::vector& masks, float_t* out) { for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } void dropout(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out) { + const float_t* in, unsigned* masks, float_t* out) { for (size_t i = 0; i < n; ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < n; ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } void d_dropout(const float scale, const vec_t& in_diff, - std::vector& mask, vec_t& out_diff) { + std::vector& masks, vec_t& out_diff) { for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* mask, float_t* out_diff) { + unsigned* masks, float_t* out_diff) { for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } @@ -469,9 +469,10 @@ float_t cross_entropy(const vec_t& y, const vec_t& p) { // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * // std::log(float_t(1e-10)); else - loss -= - y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - // - p[i]); loss -= y[i] * std::log(p[i]); + loss -= y[i] * + std::log( + p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); + // loss -= y[i] * std::log(p[i]); } return loss; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 70ddd8826d..49fa979e0a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,82 +1,109 @@ #include "math_functions.hh" #include "context.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include -void gpu_rng_uniform(const int n, unsigned *r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); +void gpu_rng_uniform(const int n, unsigned* r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); - const float range = b - a; - if (range != float_t(1)) scal_gpu(n, range, r); - if (a != float_t(0)) add_scalar_gpu(n, a, r); +void gpu_rng_uniform(const int n, const float_t a, const float_t b, + float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t(1)) + scal_gpu(n, range, r); + if (a != float_t(0)) + add_scalar_gpu(n, a, r); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { - CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK( + 
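The softmax-loss layer normalizes each masked row with softmax, accumulates -log p[label] as the loss, and composes d_cross_entropy with d_softmax for the gradient. For a one-hot label those two Jacobians collapse to the standard form grad = p - onehot(label); a self-contained per-row sketch (illustrative only):

// Per-row softmax, cross-entropy loss, and the fused gradient p - onehot(label).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

static void softmax_row(const std::vector<float>& x, std::vector<float>& p) {
  float maxv = x[0];
  for (float v : x) maxv = std::max(maxv, v); // subtract the max for stability
  float denom = 0.0f;
  p.resize(x.size());
  for (size_t i = 0; i < x.size(); i++) {
    p[i] = std::exp(x[i] - maxv);
    denom += p[i];
  }
  for (float& v : p) v /= denom;
}

static float cross_entropy_row(const std::vector<float>& p, size_t label) {
  return -std::log(std::max(p[label], 1e-10f)); // clamp to avoid log(0)
}

static void softmax_xent_grad_row(const std::vector<float>& p, size_t label,
                                  std::vector<float>& grad) {
  grad = p;            // dL/dx_i = p_i for i != label
  grad[label] -= 1.0f; // and p_label - 1 for the true class
}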
curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void loss_malloc_device(int n, float_t *&loss) { - CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +void loss_malloc_device(int n, float_t* loss) { + CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } -void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks) { - assert(h_masks != NULL); - CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +void copy_masks_device(int n, mask_t* h_masks, mask_t* d_masks) { + assert(h_masks != NULL); + CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK( + cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad) { - if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0/(y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y*z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned* masks, float_t* in, float_t* out, + float_t* matrix, float_t* grad) { + if (dropout) + CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0 / (y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y * z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, - float_t* r) { - CURAND_CHECK( - curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +void copy_gpu(size_t len, const float_t* in, float_t* out) { + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } -__global__ void setup_curand_kernel(const int n, curandState *state) { - CUDA_KERNEL_LOOP(i, n) { - //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed - } +__global__ void setup_curand_kernel(const int n, curandState* state) { + CUDA_KERNEL_LOOP(i, n) { + curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + // curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + } } -__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
1 : 0; - masks[i] = 1.0 - dropout_rate; - out[i] = in[i] * masks[i] * scale; - } +__device__ bool bernoulli_gpu(int tid, curandState* state, float_t p) { + curandState local_state = state[tid]; + return curand_uniform(&local_state) <= p; } -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - curandState *devStates; - CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); - //std::cout << "[debug]: setup curand, n = " << n << "\n"; - //setup_curand_kernel<<>>(n, devStates); - //CudaTest("solving setup_curand kernel failed"); - //std::cout << "[debug]: dropout_gpu\n"; - dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); - CudaTest("solving dropout kernel failed"); - CUDA_CHECK(cudaFree(devStates)); - //std::cout << "[debug]: dropout_gpu done\n"; +__global__ void dropout_kernel(const int n, const float scale, + const float dropout_rate, const float_t* in, + unsigned* masks, curandState* state, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + masks[i] = bernoulli_gpu(i, state, dropout_rate); + out[i] = in[i] * masks[i] * scale; + } } void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { + curandState* devStates; + CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); + std::cout << "[debug]: setup curand, n = " << n << "\n"; + setup_curand_kernel<<>>(n, devStates); + CudaTest("solving setup_curand kernel failed"); + std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, out); + n, scale, dropout_rate, in, masks, devStates, out); + CudaTest("solving dropout kernel failed"); + CUDA_CHECK(cudaFree(devStates)); + std::cout << "[debug]: dropout_gpu done\n"; +} + +__global__ void d_dropout_kernel(const int n, const float scale, + const float_t* in, const unsigned* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } +} + +void d_dropout_gpu(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out) { + d_dropout_kernel<<>>(n, scale, in, + masks, out); + CudaTest("solving dropout kernel failed"); } // flattern data into 1D before feed into the ReLU operater @@ -84,10 +111,10 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? 
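// [editor's note] bernoulli_gpu above draws from a local copy of the curand
// state but never writes the advanced state back to state[tid], so repeated
// calls for the same thread would see the same draw. A hedged sketch of a
// state-persisting variant (same acceptance convention as the helper above;
// the name is illustrative only):
__device__ bool bernoulli_persist_sketch(int tid, curandState* state, float_t p) {
  curandState local_state = state[tid];
  bool sampled = curand_uniform(&local_state) <= p;
  state[tid] = local_state; // persist the advanced RNG state for the next call
  return sampled;
}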
in[index] : 0; } } -void relu_gpu(const int n, const float_t *in, float_t* out) { - //std::cout << "[debug]: relu_gpu\n"; - relu_kernel<<>>(n, in, out); - CudaTest("solving relu kernel failed"); +void relu_gpu(const int n, const float_t* in, float_t* out) { + std::cout << "[debug]: relu_gpu\n"; + relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, @@ -101,6 +128,7 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); + CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -117,17 +145,24 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, K, &alpha, B, ldb, A, lda, &beta, C, N)); } -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { - //std::cout << "[debug]: matmul1D1D_gpu\n"; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + std::cout << "[debug]: matmul1D1D_gpu\n"; + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, const float beta, float* y) { - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +// the arguments of the maxima +int argmax_gpu(const size_t n, const float_t* x) { return 0; } + +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? 
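// [editor's note] The operand order in sgemm_gpu (B passed before A, output
// leading dimension N) and the inverted transpose flag chosen for cublasSgemv
// (CblasNoTrans maps to CUBLAS_OP_T) are the usual trick for driving
// column-major cuBLAS with row-major buffers: computing B^T * A^T in
// column-major storage yields (A * B)^T, which is exactly A * B laid out
// row-major, so no data is physically transposed.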
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, + N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { @@ -157,6 +192,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); + CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, @@ -166,6 +202,7 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); + CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, @@ -175,6 +212,7 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); + CudaTest("solving vadd kernel failed"); } // TODO: use warp @@ -224,90 +262,70 @@ __device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, - const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked - softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len*i], loss[i]); - } - } -} - -void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { - softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); - CudaTest("solving softmax_cross_entropy kernel failed"); -} - -__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, - const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[i], out+len*i, out_grad); - d_softmax(len, out+len*i, out_grad, diff+len*i); - } -} - -void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); - CudaTest("solving d_softmax_cross_entropy kernel failed"); -} - -__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end-begin) { - if (masks[begin+i] == 1) - //total += loss[begin+i]; - total.reduce(loss[begin+i]); - } - total.thread_exit >(local_loss); -} - -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()) / count; +__global__ void softmax_cross_entropy_kernel(int n, int len, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { 
// masked + softmax(len, in_data + len * i, + out_data + len * i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len * i], loss[i]); + } + } } -// the arguments of the maxima -__device__ size_t argmax_device(const size_t n, const float_t *x) { - float_t max = x[0]; - size_t max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - -__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, size_t end, mask_t *masks, float_t *preds, label_t *labels, HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_accuracy; - CUDA_KERNEL_LOOP(i, end-begin) { - if (masks[begin+i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, preds+(begin+i)*num_classes); - if (pred == labels[begin+i]) total.reduce(1.0); - } - } - total.thread_exit >(local_accuracy); -} - -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels) { - HGAccumulator accuracy_accum; - Shared total_accuracy = Shared(1); - *(total_accuracy.cpu_wr_ptr()) = 0; - accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); - masked_accuracy_kernel<<>>(num_classes, begin, end, masks, preds, labels, accuracy_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_accuracy.cpu_rd_ptr()) / count; +void softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); +} + +__global__ void +d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; // TODO + d_cross_entropy(len, labels[i], out + len * i, out_grad); + d_softmax(len, out + len * i, out_grad, diff + len * i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + d_softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy kernel failed"); +} + +__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, + float_t* loss, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + // total += loss[begin+i]; + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, + float_t* loss) { + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index f76ccaeb8a..09267795df 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,11 +1,9 @@ #include "net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { - context = new Context(); - // Context::create_blas_handle(); + context = new 
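// [editor's note] For reference: masked_avg_loss_kernel sums loss[i] over the
// samples in [begin, end) whose mask is set (the division by `count` that
// turns this sum into an average is restored in the follow-up commit further
// below), and masked_accuracy_gpu reports the fraction of masked samples whose
// argmax prediction equals the label, i.e. correct / count.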
Context(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); - context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; @@ -100,20 +98,3 @@ void Net::construct_layers() { layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } - -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { -#ifdef CPU_ONLY - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); - if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; -#else - return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS-1]->next()->get_data(), context->d_labels); -#endif -} - diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index f4278688d1..9b88620d65 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,6 +1,9 @@ #include "node.h" +#include void edge::alloc() { + // std::cout << "Allocating memory for tensors (intermediate features and + // gradients) ...\n"; #ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index 2443e9ed7c..e6d149a540 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -14,5 +14,5 @@ void edge::merge_grads_gpu(float_t* dst) { } void edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_ * num_samples_ * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } From c9fb35816c2f4364203426b9b5aba737bfa5f06a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:05:02 -0600 Subject: [PATCH 037/660] set 2 of changes --- libdeepgalois/include/layers/layer.h | 18 ++- libdeepgalois/include/math_functions.hh | 53 +++++--- libdeepgalois/include/net.h | 23 +--- libdeepgalois/src/aggregator.cu | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 21 ++-- .../src/layers/softmax_loss_layer.cpp | 36 +----- libdeepgalois/src/math_functions.cpp | 7 +- libdeepgalois/src/math_functions.cu | 119 +++++++++++------- libdeepgalois/src/net.cpp | 31 ++++- 9 files changed, 168 insertions(+), 142 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 609047853a..cec1da3665 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -44,7 +44,9 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context* ctx) { context = ctx; } - virtual acc_t get_masked_loss() { return acc_t(0); } + // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = + // 0; virtual void back_propagation(const vec_t &in_data, const vec_t + // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; virtual void forward_propagation(const float_t* in_data, float_t* out_data) = 0; virtual void back_propagation(const float_t* in_data, const float_t* out_data, @@ -54,6 +56,7 @@ class layer : public node { bool trainable() const { return trainable_; } void set_name(std::string 
name) { name_ = name; } std::string get_name() { return name_; } + mask_t* get_device_masks() { return d_masks_; } void print_layer_info() { std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" @@ -70,16 +73,9 @@ class layer : public node { #endif } void set_in_data(float_t* data) { - assert(data.size() == input_dims[0] * input_dims[1]); prev_ = std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. - // - // allocate memory for intermediate features - // prev_->get_data() = data; - // std::copy(data.begin(), data.end(), prev_->get_data()); - // allocate memory for intermediate gradients - // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); } void add_edge() { // add an outgoing edge @@ -91,16 +87,16 @@ class layer : public node { // allocate memory for intermediate gradients } void forward() { - std::cout << name_ << ": forwarding ... "; + // std::cout << name_ << ": forwarding ... "; forward_propagation(prev()->get_data(), next()->get_data()); } void backward() { - std::cout << name_ << ": backwarding ... "; + // std::cout << name_ << ": backwarding ... "; back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer* opt) { - std::cout << name_ << ": weight updating ... "; + // std::cout << name_ << ": weight updating ... "; // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 02afab2c49..61e95ef5b0 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -70,25 +70,42 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); -void copy_gpu(size_t len, const float_t *in, float_t *out); -void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add -void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU -void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout -void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); -void scal_gpu(const int N, const float alpha, float *X); +void copy_gpu(size_t len, const float_t* in, float_t* out); +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void relu_gpu(const int n, const float_t* in, float_t* 
out); // ReLU +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff); // ReLU derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out); // dropout +void d_dropout_gpu(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels); +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, + float_t* loss); +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, + size_t count, mask_t* masks, float_t* preds, + label_t* labels); -void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out); -void loss_malloc_device(int n, float_t *&loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad); +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, + float_t*& in, float_t*& out); +void loss_malloc_device(int n, float_t*& loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned*& masks, float_t*& in, float_t*& out, + float_t*& matrix, float_t*& grad); #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 87a0e3b72b..4a83caaf88 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -65,7 +65,7 @@ class Net { // forward propagation: [begin, end) is the range of samples used. 
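// [editor's note] In fprop below, the sample mask installed on the last layer
// is what the softmax-loss layer's get_masked_loss() and Net::masked_accuracy
// consume: both skip samples whose mask is 0 and divide by `count` (the number
// of set masks) rather than by (end - begin).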
acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); + layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) @@ -108,27 +108,8 @@ class Net { std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), - [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1] - ->next() - ->get_data()[i * num_classes])); - if ((label_t)preds == context->get_label(i)) - accuracy_all += 1.0; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; - } + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); }; #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 3a0288b197..3d6d016363 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -34,8 +34,8 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - std::cout << "[debug]: update_all on GPU\n"; unsigned n = g.nnodes; + std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>( n, len, g, in, out, norm, norm_factor); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 715fcafd39..d53a75e53a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,10 +37,10 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { + std::cout << name_ + << ": allocating memory for parameters and intermediate data... "; Timer t_alloc; t_alloc.Start(); - // std::cout << name_ << ": allocating memory for parameters and intermediate - // data... 
"; #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); @@ -57,7 +57,7 @@ void graph_conv_layer::init() { d_weight_grad); #endif t_alloc.Stop(); - // std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY @@ -76,8 +76,8 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z } else - matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); if (act_) { galois::do_all( galois::iterate((size_t)0, x), @@ -135,12 +135,15 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, assert(in_data != NULL); assert(in_temp != NULL); assert(dropout_mask != NULL); + // std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", + // dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" + // << out_data << "\n"; if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); + // aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); } @@ -156,10 +159,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - update_all(y, context->graph_gpu, in_temp, in_grad, true, - context->d_norm_factor); + // update_all(y, context->graph_gpu, in_temp, in_grad, true, + // context->d_norm_factor); if (dropout_) - d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); + d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 85e81d038c..a953dd5f1e 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -6,19 +6,19 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - std::cout << name_ << ": allocating memory for intermediate data... 
"; #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - loss_malloc_device(in_dims[0], loss); + out_malloc_device(in_dims[0], masks_, d_masks_, loss); #endif - std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t + // &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -35,6 +35,8 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::loopname("softmax-loss-fw")); } +// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t +// &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { @@ -54,25 +56,6 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } - -acc_t softmax_loss_layer::get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; -} - #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -80,14 +63,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, - context->d_labels, out_data, in_grad); -} - acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, masks_, loss); + return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index e21bb42396..3acc213d5e 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -469,10 +469,9 @@ float_t cross_entropy(const vec_t& y, const vec_t& p) { // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * // std::log(float_t(1e-10)); else - loss -= y[i] * - std::log( - p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); - // loss -= y[i] * std::log(p[i]); + loss -= + y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) + // - p[i]); loss -= y[i] * std::log(p[i]); } return loss; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 49fa979e0a..e098922ba1 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,9 +1,5 @@ #include "math_functions.hh" #include "context.h" -#include "gg.h" -#include "ggcuda.h" -#include "cub/cub.cuh" -#include void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -25,11 +21,11 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, 
curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void loss_malloc_device(int n, float_t* loss) { +void loss_malloc_device(int n, float_t*& loss) { CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } -void copy_masks_device(int n, mask_t* h_masks, mask_t* d_masks) { +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); CUDA_CHECK( @@ -37,8 +33,8 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t* d_masks) { } void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned* masks, float_t* in, float_t* out, - float_t* matrix, float_t* grad) { + unsigned*& masks, float_t*& in, float_t*& out, + float_t*& matrix, float_t*& grad) { if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); @@ -51,29 +47,27 @@ void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -void copy_gpu(size_t len, const float_t* in, float_t* out) { - CUDA_CHECK( - cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK( + curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } __global__ void setup_curand_kernel(const int n, curandState* state) { CUDA_KERNEL_LOOP(i, n) { - curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - // curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + curand_init(7 + i, i, 0, &state[i]); // Each thread gets different seed } } -__device__ bool bernoulli_gpu(int tid, curandState* state, float_t p) { - curandState local_state = state[tid]; - return curand_uniform(&local_state) <= p; -} - __global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, curandState* state, float_t* out) { CUDA_KERNEL_LOOP(i, n) { - masks[i] = bernoulli_gpu(i, state, dropout_rate); + // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + // masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
1 : 0; + masks[i] = 1.0 - dropout_rate; out[i] = in[i] * masks[i] * scale; } } @@ -82,28 +76,21 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { curandState* devStates; CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); - std::cout << "[debug]: setup curand, n = " << n << "\n"; - setup_curand_kernel<<>>(n, devStates); - CudaTest("solving setup_curand kernel failed"); - std::cout << "[debug]: dropout_gpu\n"; + // std::cout << "[debug]: setup curand, n = " << n << "\n"; + // setup_curand_kernel<<>>(n, + // devStates); CudaTest("solving setup_curand kernel failed"); std::cout << + // "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); CudaTest("solving dropout kernel failed"); CUDA_CHECK(cudaFree(devStates)); - std::cout << "[debug]: dropout_gpu done\n"; + // std::cout << "[debug]: dropout_gpu done\n"; } -__global__ void d_dropout_kernel(const int n, const float scale, - const float_t* in, const unsigned* masks, - float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } -} - -void d_dropout_gpu(const int n, const float scale, const float_t* in, - const unsigned* masks, float_t* out) { - d_dropout_kernel<<>>(n, scale, in, - masks, out); - CudaTest("solving dropout kernel failed"); +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out) { + dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); } // flattern data into 1D before feed into the ReLU operater @@ -112,7 +99,7 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } void relu_gpu(const int n, const float_t* in, float_t* out) { - std::cout << "[debug]: relu_gpu\n"; + // std::cout << "[debug]: relu_gpu\n"; relu_kernel<<>>(n, in, out); CudaTest("solving relu kernel failed"); } @@ -128,7 +115,6 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); - CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -147,15 +133,12 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { - std::cout << "[debug]: matmul1D1D_gpu\n"; + // std::cout << "[debug]: matmul1D1D_gpu\n"; const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -// the arguments of the maxima -int argmax_gpu(const size_t n, const float_t* x) { return 0; } - void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { @@ -192,7 +175,6 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); - CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, @@ -202,7 +184,6 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); - CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, @@ -212,7 +193,6 @@ __global__ void vadd_kernel(const int n, 
const float_t* a, const float_t* b, void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); - CudaTest("solving vadd kernel failed"); } // TODO: use warp @@ -308,13 +288,13 @@ __global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, float_t* loss, HGAccumulator total) { total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; + __shared__ cub::BlockReduce::TempStorage local_loss; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) // total += loss[begin+i]; total.reduce(loss[begin + i]); } - total.thread_exit>(local_loss); + total.thread_exit>(local_loss); } acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, @@ -327,5 +307,50 @@ acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, begin, end, masks, loss, loss_accum); CudaTest("solving masked_avg_loss kernel failed"); cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()); + return *(total_loss.cpu_rd_ptr()) / count; +} + +// the arguments of the maxima +__device__ size_t argmax_device(const size_t n, const float_t* x) { + float_t max = x[0]; + size_t max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, + size_t end, mask_t* masks, + float_t* preds, label_t* labels, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage + local_accuracy; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); + if (pred == labels[begin + i]) + total.reduce(1.0); + } + } + total.thread_exit>(local_accuracy); +} + +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, + size_t count, mask_t* masks, float_t* preds, + label_t* labels) { + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>( + num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 09267795df..41393e6f13 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,9 +1,11 @@ #include "net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { - context = new Context(); + context = new Context(); + // Context::create_blas_handle(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); + context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; @@ -98,3 +100,30 @@ void Net::construct_layers() { layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { +#ifdef CPU_ONLY + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), + [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1] + ->next() + ->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), + 
galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; +#else + return masked_accuracy_gpu(num_classes, begin, end, count, + layers[NUM_CONV_LAYERS]->get_device_masks(), + layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + context->d_labels); +#endif +} From 6336fe34b4fece4f2618fd534529c0e6451ca896 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:16:21 -0600 Subject: [PATCH 038/660] softmax missing functions --- .../src/layers/softmax_loss_layer.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index a953dd5f1e..66ce404a18 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -56,6 +56,27 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } + +acc_t softmax_loss_layer::get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} + + + #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -63,6 +84,13 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, out_data, in_grad); +} + acc_t softmax_loss_layer::get_masked_loss() { return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } From 0339aa284f5934bd7079350dba60abc1c91b0fa2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:20:25 -0600 Subject: [PATCH 039/660] remove degree counting from net.cpp --- libdeepgalois/src/net.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 41393e6f13..ddd6df4afa 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -3,15 +3,18 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); // Context::create_blas_handle(); + // read graph, get num nodes num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); - context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor + num_epochs = epochs; std::cout << "Reading label masks ... 
"; train_mask.resize(num_samples, 0); val_mask.resize(num_samples, 0); + + // get testing and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; @@ -25,9 +28,11 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); } + std::cout << "Done\n"; num_layers = NUM_CONV_LAYERS + 1; + // initialize feature metadata feature_dims.resize(num_layers + 1); feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D @@ -42,7 +47,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { } void Net::train(optimizer* opt, bool need_validate) { - std::cout << "\nStart training...\n"; + galois::gPrint("\nStart training...\n"); galois::StatTimer Tupdate("Train-WeightUpdate"); galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); From 8038b6bdd485d7e9b22a0817174b33be833fb520 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 19:03:41 -0600 Subject: [PATCH 040/660] fix merge errors --- libdeepgalois/include/layers/layer.h | 1 + .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 31 ++++++++++++------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index cec1da3665..68260c034a 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -44,6 +44,7 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context* ctx) { context = ctx; } + virtual acc_t get_masked_loss() { return acc_t(0); } // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = // 0; virtual void back_propagation(const vec_t &in_data, const vec_t // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 66ce404a18..dbebe73f44 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,7 +9,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - out_malloc_device(in_dims[0], masks_, d_masks_, loss); + loss_malloc_device(in_dims[0], loss); #endif } #ifdef CPU_ONLY diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e098922ba1..c507ee313b 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,9 @@ #include "math_functions.hh" #include "context.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -47,12 +51,6 @@ void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, - float_t* r) { - CURAND_CHECK( - curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); -} - __global__ void setup_curand_kernel(const int n, curandState* state) { CUDA_KERNEL_LOOP(i, n) { // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 
1234 @@ -77,8 +75,8 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, curandState* devStates; CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); // std::cout << "[debug]: setup curand, n = " << n << "\n"; - // setup_curand_kernel<<>>(n, - // devStates); CudaTest("solving setup_curand kernel failed"); std::cout << + // setup_curand_kernel<<>>(n, devStates); + // CudaTest("solving setup_curand kernel failed"); std::cout << // "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); @@ -87,10 +85,15 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, // std::cout << "[debug]: dropout_gpu done\n"; } -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, out); +__global__ void d_dropout_kernel(const int n, const float scale, + const float_t* in, const unsigned* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } +} + +void d_dropout_gpu(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out) { + d_dropout_kernel<<>>(n, scale, in, masks, out); } // flattern data into 1D before feed into the ReLU operater @@ -191,6 +194,10 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } +void copy_gpu(size_t len, const float_t* in, float_t* out) { + CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +} + void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } From 6ec8a06b362c9cfd1ea8c3cb942b7767d0237537 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 19:30:47 -0600 Subject: [PATCH 041/660] refine --- .../src/layers/softmax_loss_layer.cpp | 7 ----- libdeepgalois/src/math_functions.cpp | 10 +------ libdeepgalois/src/math_functions.cu | 4 +++ libdeepgalois/src/net.cpp | 29 +++++++------------ 4 files changed, 15 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index dbebe73f44..0cd9547250 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -17,8 +17,6 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t - // &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -35,8 +33,6 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::loopname("softmax-loss-fw")); } -// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t -// &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { @@ -74,9 +70,6 @@ acc_t softmax_loss_layer::get_masked_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } - - - #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { diff --git a/libdeepgalois/src/math_functions.cpp 
b/libdeepgalois/src/math_functions.cpp index 3acc213d5e..9914fd68d5 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -466,12 +466,7 @@ float_t cross_entropy(const vec_t& y, const vec_t& p) { continue; if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * - // std::log(float_t(1e-10)); - else - loss -= - y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - // - p[i]); loss -= y[i] * std::log(p[i]); + else loss -= y[i] * std::log(p[i]); } return loss; } @@ -491,11 +486,8 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { auto n = y.size(); - // for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - - // p[i])); for (size_t i = 0; i < n; i++) { d[i] = -y[i] / (p[i] + float_t(1e-10)); - // d[i] = p[i] - y[i]; } } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index c507ee313b..28e65e149d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -118,6 +118,7 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); + CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -178,6 +179,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); + CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, @@ -187,6 +189,7 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); + CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, @@ -200,6 +203,7 @@ void copy_gpu(size_t len, const float_t* in, float_t* out) { void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); + CudaTest("solving vadd kernel failed"); } // TODO: use warp diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index ddd6df4afa..9b78853833 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,18 +2,15 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); - // Context::create_blas_handle(); // read graph, get num nodes num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); context->norm_factor_counting(); // pre-compute normalizing factor - num_epochs = epochs; std::cout << "Reading label masks ... 
"; train_mask.resize(num_samples, 0); val_mask.resize(num_samples, 0); - // get testing and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, @@ -28,7 +25,6 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); } - std::cout << "Done\n"; num_layers = NUM_CONV_LAYERS + 1; @@ -41,8 +37,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY - context - ->copy_data_to_device(); // copy labels and input features to the device + context->copy_data_to_device(); // copy labels and input features to the device #endif } @@ -111,19 +106,15 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, #ifdef CPU_ONLY AccumF accuracy_all; accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), - [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1] - ->next() - ->get_data()[i * num_classes])); - if ((label_t)preds == context->get_label(i)) - accuracy_all += 1.0; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); return accuracy_all.reduce() / (acc_t)count; #else return masked_accuracy_gpu(num_classes, begin, end, count, From 969ae109d5e14818507365b8078bdec2bae2fb6e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 19:56:38 -0600 Subject: [PATCH 042/660] refine code --- libdeepgalois/src/aggregator.cu | 10 +++----- libdeepgalois/src/math_functions.cu | 39 ++++++++++++----------------- libdeepgalois/src/optimizer.cpp | 26 +++++++++---------- libdeepgalois/src/optimizer.cu | 3 ++- 4 files changed, 34 insertions(+), 44 deletions(-) diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 3d6d016363..c5ed6e0817 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -13,19 +13,17 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, } __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, - const float_t* in, float_t* out, bool norm, - const float_t* norm_factor) { + const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { float_t a = 0.0, b = 1.0; - if (norm) - a = norm_factor[src]; + if (norm) a = norm_factor[src]; index_type begin = g.edge_begin(src); index_type end = g.edge_end(src); for (index_type e = begin; e != end; e++) { index_type dst = g.getEdgeDst(e); assert(dst < n); - if (norm) - b = a * norm_factor[dst]; + if (norm) b = a * norm_factor[dst]; scale_add(len, b, in + dst * len, out + src * len, out + src * len); // out[src] += in[dst] } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 28e65e149d..174cd1b36a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -9,8 +9,7 @@ void gpu_rng_uniform(const int n, unsigned* r) { 
CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, - float_t* r) { +void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) @@ -19,10 +18,8 @@ void gpu_rng_uniform(const int n, const float_t a, const float_t b, add_scalar_gpu(n, a, r); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, - float_t* r) { - CURAND_CHECK( - curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { + CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } void loss_malloc_device(int n, float_t*& loss) { @@ -32,15 +29,13 @@ void loss_malloc_device(int n, float_t*& loss) { void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK( - cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad) { - if (dropout) - CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); + if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); @@ -60,11 +55,11 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { __global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, - unsigned* masks, curandState* state, - float_t* out) { + unsigned* masks, curandState* state, float_t* out) { CUDA_KERNEL_LOOP(i, n) { - // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - // masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 1 : 0; + // curandState_t curand_state; + //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
1 : 0; masks[i] = 1.0 - dropout_rate; out[i] = in[i] * masks[i] * scale; } @@ -74,20 +69,19 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { curandState* devStates; CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); - // std::cout << "[debug]: setup curand, n = " << n << "\n"; - // setup_curand_kernel<<>>(n, devStates); - // CudaTest("solving setup_curand kernel failed"); std::cout << - // "[debug]: dropout_gpu\n"; + //std::cout << "[debug]: setup curand, n = " << n << "\n"; + //setup_curand_kernel<<>>(n, devStates); + //CudaTest("solving setup_curand kernel failed"); + std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); CudaTest("solving dropout kernel failed"); CUDA_CHECK(cudaFree(devStates)); - // std::cout << "[debug]: dropout_gpu done\n"; + std::cout << "[debug]: dropout_gpu done\n"; } -__global__ void d_dropout_kernel(const int n, const float scale, - const float_t* in, const unsigned* masks, - float_t* out) { +__global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out) { CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } } @@ -102,7 +96,6 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } void relu_gpu(const int n, const float_t* in, float_t* out) { - // std::cout << "[debug]: relu_gpu\n"; relu_kernel<<>>(n, in, out); CudaTest("solving relu kernel failed"); } diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index fb10221f19..b076df561f 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -21,26 +21,24 @@ void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& g = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, - galois::loopname("rms_update")); + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, galois::loopname("rms_update")); } void adam::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& mt = get<0>(W); vec_t& vt = get<1>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("adam_update")); + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 908ce4f32a..a936999d3e 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,3 +1,4 @@ #include "optimizer.h" -void adam::update_gpu(const float_t* dW, float_t* W) {} +void adam::update_gpu(const float_t* dW, float_t* W) { +} From 
9e4c91ea42aac12a1d36664d7ac0ead74e3af363 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 20:27:32 -0600 Subject: [PATCH 043/660] add adam gpu --- libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/optimizer.h | 18 +++++----- libdeepgalois/src/math_functions.cu | 4 +-- libdeepgalois/src/optimizer.cpp | 49 +++++++++++++--------------- libdeepgalois/src/optimizer.cu | 25 +++++++++++++- 5 files changed, 58 insertions(+), 40 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 68260c034a..bd8f67fa07 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -106,7 +106,7 @@ class layer : public node { bool parallel = (W.size() >= 512); opt->update(weight_grad, W, parallel); // W += grad #else - opt->update_gpu(d_weight_grad, d_W); // W += grad + opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); next()->clear_grads(); diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index ed8e7654d9..d9f8de9116 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -15,7 +15,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; - virtual void update_gpu(const float_t* dW, float_t* W) = 0; + virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -48,7 +48,7 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate private: float_t eps; @@ -63,7 +63,7 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t mu; // decay term private: @@ -78,9 +78,9 @@ struct adam : public stateful_optimizer<2> { b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); #ifdef CPU_ONLY - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} #else - void update_gpu(const float_t* dW, float_t* W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -104,7 +104,7 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t b1; // decay term @@ -120,7 +120,7 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void 
update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -136,7 +136,7 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay @@ -155,7 +155,7 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 174cd1b36a..3bdcbd2607 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -72,12 +72,12 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, //std::cout << "[debug]: setup curand, n = " << n << "\n"; //setup_curand_kernel<<>>(n, devStates); //CudaTest("solving setup_curand kernel failed"); - std::cout << "[debug]: dropout_gpu\n"; + //std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); CudaTest("solving dropout kernel failed"); CUDA_CHECK(cudaFree(devStates)); - std::cout << "[debug]: dropout_gpu done\n"; + //std::cout << "[debug]: dropout_gpu done\n"; } __global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index b076df561f..0ec40cf4d0 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -5,11 +5,10 @@ void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& g = get<0>(W); if (parallelize) { galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, - galois::loopname("adagrad_update")); + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); } else { for (size_t i = 0; i < W.size(); i++) { g[i] += dW[i] * dW[i]; @@ -47,41 +46,37 @@ void adamax::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& mt = get<0>(W); vec_t& ut = get<1>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, - galois::loopname("adamax_update")); + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); b1_t *= b1; } void gradient_descent::update(const vec_t& dW, vec_t& W, bool parallelize) { - galois::do_all( - galois::iterate((size_t)0, W.size()), + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, - 
galois::loopname("gradient_descent_update")); + galois::loopname("gradient_descent_update")); } void momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& dWprev = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, - galois::loopname("momentum_update")); + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, galois::loopname("momentum_update")); } void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& dWprev = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, - galois::loopname("nesterov_momentum_update")); + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, galois::loopname("nesterov_momentum_update")); } diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index a936999d3e..7d718ea865 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,4 +1,27 @@ #include "optimizer.h" +#include "cutils.h" +#include "math_functions.hh" -void adam::update_gpu(const float_t* dW, float_t* W) { +__global__ void update_kernel(const int n, float_t alpha, float_t b1, + float_t b2, float_t b1_t, float_t b2_t, + float_t eps, float_t* mt, float_t* vt, + const float_t* dW, float_t* W) { + CUDA_KERNEL_LOOP(i, n) { + mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; + vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; + W[i] -= alpha * (mt[i] / (1.0 - b1_t)) / + std::sqrt((vt[i] / (1.0 - b2_t)) + eps); + } +} + +void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { + float_t * W1, *W2; + CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); + copy_gpu(n, W, W1); + copy_gpu(n, W, W2); + update_kernel<<>>( + n, alpha, b1, b2, b1_t, b2_t, eps, W1, W2, dW, W); + b1_t *= b1; + b2_t *= b2; } From e225e2ca620be4f7051c4562515a291b9c69d4a7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 21:09:17 -0600 Subject: [PATCH 044/660] refine graph_conv_layer.cpp --- libdeepgalois/src/layers/graph_conv_layer.cpp | 95 +++++++------------ 1 file changed, 33 insertions(+), 62 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d53a75e53a..6ab8662101 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,18 +1,15 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, - float_t* out) { +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->norm_factor); #else -void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, - float_t* out) { +void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->d_norm_factor); #endif } -void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, - vec_t& out) { +void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& out) { vec_t 
a(out.size(), 0); vec_t b(out.size(), 0); mvmul(Q, self, a); @@ -37,8 +34,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { - std::cout << name_ - << ": allocating memory for parameters and intermediate data... "; + std::cout << name_ << ": allocating memory for params and temp data... "; Timer t_alloc; t_alloc.Start(); #ifdef CPU_ONLY @@ -48,13 +44,11 @@ void graph_conv_layer::init() { if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; - out_temp = new float_t - [x * z]; // same as pre_sup in original GCN code: + out_temp = new float_t[x * z]; // same as pre_sup in original GCN code: // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data = new float_t[y * x]; // y*x #else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, - d_weight_grad); + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); #endif t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; @@ -62,18 +56,16 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { +void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, - galois::loopname("dropout")); + [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z @@ -90,20 +82,14 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - // void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t - // &out_data, vec_t &out_grad, vec_t &in_grad) { if (act_) { galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - for (size_t j = 0; j < z; - ++j) // TODO: use in_data or out_data? - out_temp[i * z + j] = out_data[i * z + j] > float_t(0) - ? out_grad[i * z + j] - : float_t(0); - }, - galois::loopname("d_relu")); - } else - copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + [&](const auto& i) { + for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? + out_temp[i * z + j] = out_data[i * z + j] > float_t(0) + ? 
out_grad[i * z + j] : float_t(0); + }, galois::loopname("d_relu")); + } else copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix @@ -114,12 +100,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - d_dropout(y, scale_, &in_grad[i * y], - &dropout_mask[i * y], &in_grad[i * y]); - }, - galois::chunk_size(), galois::steal(), - galois::loopname("d_dropout")); + [&](const auto& i) { + d_dropout(y, scale_, &in_grad[i * y], + &dropout_mask[i * y], &in_grad[i * y]); + }, galois::chunk_size(), galois::steal(), + galois::loopname("d_dropout")); } } // calculate weight gradients @@ -128,43 +113,29 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } #else -// GPU forward +// GPU forward: compute output features void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 - assert(in_data != NULL); - assert(in_temp != NULL); - assert(dropout_mask != NULL); - // std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", - // dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" - // << out_data << "\n"; if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else - matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - // aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) - relu_gpu(x * z, out_data, out_data); + } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + //aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x * z, out_data, out_data); } -// GPU backward +// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (act_) - d_relu_gpu(x * z, out_grad, out_data, out_temp); - else - copy_gpu(x * z, out_grad, out_temp); + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); + else copy_gpu(x * z, out_grad, out_temp); if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, - in_temp); - // update_all(y, context->graph_gpu, in_temp, in_grad, true, - // context->d_norm_factor); - if (dropout_) - d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, - d_weight_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif From 1ec2fb9f39f0d5dafc91fdad489cd3a795b2777c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 21:25:09 -0600 Subject: [PATCH 045/660] refine softmax_loss_layer.cpp --- .../src/layers/softmax_loss_layer.cpp | 61 +++++++++---------- libdeepgalois/src/math_functions.cu | 28 +++------ 2 files changed, 38 insertions(+), 51 deletions(-) diff --git 
a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 0cd9547250..1c305827ac 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -19,18 +19,16 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(len, &in_data[len * i], - &out_data[len * i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(len, &y[0], &out_data[len * i]); - } - }, - galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-fw")); + [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[context->get_label(i)] = 1.0; // one-hot + loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); + } + }, galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-fw")); } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -38,19 +36,17 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len * i], - &norm_grad[0]); - d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); - } - }, - galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-bw")); + [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth + y[context->get_label(i)] = 1.0; + d_cross_entropy(len, &y[0], &out_data[len * i], &norm_grad[0]); + d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-bw")); } acc_t softmax_loss_layer::get_masked_loss() { @@ -59,14 +55,13 @@ acc_t softmax_loss_layer::get_masked_loss() { total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 3bdcbd2607..53ff024872 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -215,8 +215,7 @@ __device__ void softmax(int n, const float_t* input, float_t* output) { } // TODO: use warp -__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, - float_t* dy) { +__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, float_t* dy) { for 
(size_t i = 0; i < n; i++) { dy[i] = 0; for (size_t j = 0; j < n; j++) { @@ -226,21 +225,15 @@ __device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, } } -__device__ void cross_entropy(int n, const label_t idx, const float_t* p, - float_t& loss) { - if (p[idx] == 0.0) - loss -= log(float_t(1e-10)); - else - loss -= log(p[idx]); +__device__ void cross_entropy(int n, const label_t idx, const float_t* p, float_t& loss) { + if (p[idx] == 0.0) loss -= log(float_t(1e-10)); + else loss -= log(p[idx]); } -__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, - float_t* d) { +__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { for (int i = 0; i < n; i++) - if (i == (int)idx) - d[i] = -1.0 / (p[i] + 1e-10); - else - d[i] = 0.0; + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; } // n: number of vectors @@ -253,8 +246,8 @@ __global__ void softmax_cross_entropy_kernel(int n, int len, float_t* loss, float_t* out_data) { CUDA_KERNEL_LOOP(i, n) { if (masks[i] == 1) { // masked - softmax(len, in_data + len * i, - out_data + len * i); // normalize using softmax + // normalize using softmax + softmax(len, in_data + len * i, out_data + len * i); loss[i] = 0.0; cross_entropy(len, labels[i], &out_data[len * i], loss[i]); } @@ -269,8 +262,7 @@ void softmax_cross_entropy_gpu(int n, int len, const float_t* in, CudaTest("solving softmax_cross_entropy kernel failed"); } -__global__ void -d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, +__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { CUDA_KERNEL_LOOP(i, n) { From 9a5722e5b93c5f1a7d919ca3b61e5923bc9262a4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 22:15:25 -0600 Subject: [PATCH 046/660] refine src/context.cpp --- libdeepgalois/include/layers/layer.h | 3 ++- libdeepgalois/src/context.cpp | 17 +++++++---------- libdeepgalois/src/optimizer.cu | 4 ++++ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index bd8f67fa07..438ee45993 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -97,15 +97,16 @@ class layer : public node { next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer* opt) { - // std::cout << name_ << ": weight updating ... "; // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY + // std::cout << name_ << ": weight updating ... "; // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
bool parallel = (W.size() >= 512); opt->update(weight_grad, W, parallel); // W += grad #else + std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 785f4b2d26..aab3e1c3cd 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -64,16 +64,13 @@ void Context::norm_factor_counting() { #ifdef CPU_ONLY norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), - graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) - norm_factor[v] = 0.0; - else - norm_factor[v] = 1.0 / temp; - }, - galois::loopname("NormCounting")); + [&](auto v) { + auto degree = std::distance(graph_cpu.edge_begin(v), + graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); #else norm_factor_counting_gpu(); #endif diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 7d718ea865..e58c641245 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,3 +1,4 @@ +#include #include "optimizer.h" #include "cutils.h" #include "math_functions.hh" @@ -15,6 +16,7 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { + std::cout << "updating weights on GPU, n = " << n << "\n"; float_t * W1, *W2; CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); @@ -24,4 +26,6 @@ void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { n, alpha, b1, b2, b1_t, b2_t, eps, W1, W2, dW, W); b1_t *= b1; b2_t *= b2; + CUDA_CHECK(cudaFree(W1)); + CUDA_CHECK(cudaFree(W2)); } From 8360c38f6baee35524584d1b3d9fe6455e7169ca Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 23:45:18 -0600 Subject: [PATCH 047/660] fix bug in aggregator --- libdeepgalois/include/cutils.h | 9 +++++++++ libdeepgalois/src/aggregator.cu | 2 +- libdeepgalois/src/context.cu | 1 + libdeepgalois/src/layers/graph_conv_layer.cpp | 9 +++++++-- libdeepgalois/src/optimizer.cu | 4 ++-- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index fac2cfaa64..c817863242 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -4,6 +4,7 @@ #include #include #include +#include // CUDA: use 256 threads per block const int CUDA_NUM_THREADS = 256; @@ -127,3 +128,11 @@ inline const char* curandGetErrorString(curandStatus_t error) { // CUDA: check for error after kernel execution and exit loudly if there is one. 
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) + +inline void print_device_vector(size_t n, const float_t *d_x, std::string name = "x") { + float_t *h_x = new float_t[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete h_x; +} + diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index c5ed6e0817..885660e973 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -12,7 +12,7 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, +__global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index b68f07ab98..0a63bb40bd 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -96,6 +96,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + print_device_vector(10, d_feats, "d_feats"); } float_t* Context::get_in_ptr() { return d_feats; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 6ab8662101..073ba9eb76 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -121,8 +121,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - //aggregate(z, context->graph_gpu, out_temp, out_data); + aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); + std::cout << "Forward " << name_ << ":\n"; + print_device_vector(10, in_data, "in_data"); } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -133,9 +135,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, else copy_gpu(x * z, out_grad, out_temp); if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } + std::cout << "Backward " << name_ << ":\n"; + print_device_vector(10, in_data, "in_data"); + print_device_vector(10, out_temp, "out_temp"); sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index e58c641245..ee9ff3b8d4 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,4 +1,3 @@ -#include #include "optimizer.h" #include "cutils.h" #include "math_functions.hh" @@ -16,7 +15,8 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { - std::cout << updating weights on GPU, n = " << n << "\n"; + std::cout << "updating 
weights on GPU, n = " << n << "\n"; + print_device_vector(10, dW, "dW"); float_t * W1, *W2; CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); From 8daaf898542ddb2a3b521e2bfb034f905d7fd846 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 11:06:17 -0600 Subject: [PATCH 048/660] refine gpu operators --- libdeepgalois/include/math_functions.hh | 32 ++-- libdeepgalois/src/aggregator.cu | 4 +- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 23 ++- .../src/layers/softmax_loss_layer.cpp | 8 +- libdeepgalois/src/math_functions.cu | 164 ++++++++++++------ libdeepgalois/src/optimizer.cu | 4 +- 7 files changed, 150 insertions(+), 87 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 61e95ef5b0..ef313815a7 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -53,24 +53,26 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, vec_t& out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, float_t* out); -void dropout(size_t n, const float scale, const float dropout_rate, +void dropout(int n, const float scale, const float dropout_rate, const float_t* in, unsigned* mask, float_t* out); void d_dropout(const float scale, const vec_t& in_diff, std::vector& mask, vec_t& out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t* in_diff, +void d_dropout(int n, const float scale, const float_t* in_diff, unsigned* mask, float_t* out_diff); void softmax(const vec_t& input, vec_t& output); -void softmax(size_t n, const float_t* input, float_t* output); +void softmax(int n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, +void d_softmax(int n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); float_t cross_entropy(const vec_t& y, const vec_t& p); -float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +float_t cross_entropy(int n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); -void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); +void d_cross_entropy(int n, const float_t* y, const float_t* p, float_t* d); -void copy_gpu(size_t len, const float_t* in, float_t* out); +// GPU operators +void init_const_gpu(int n, float_t value, float_t *array); +void copy_gpu(int len, const float_t* in, float_t* out); void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* out); // vector add void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU @@ -83,29 +85,27 @@ void d_dropout_gpu(const int n, const float scale, const float_t* in, void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply -void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, +void softmax_cross_entropy_gpu(int len, int 
begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, +void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, +acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, - size_t count, mask_t* masks, float_t* preds, +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, + int count, mask_t* masks, float_t* preds, label_t* labels); - void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, - float_t*& in, float_t*& out); void loss_malloc_device(int n, float_t*& loss); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); - #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 885660e973..f0c06722b6 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -22,7 +22,6 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, index_type end = g.edge_end(src); for (index_type e = begin; e != end; e++) { index_type dst = g.getEdgeDst(e); - assert(dst < n); if (norm) b = a * norm_factor[dst]; scale_add(len, b, in + dst * len, out + src * len, out + src * len); // out[src] += in[dst] @@ -33,7 +32,8 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; - std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + //print_device_vector(10, norm_factor, "norm_factor"); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>( n, len, g, in, out, norm, norm_factor); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 0a63bb40bd..647e010f60 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -96,7 +96,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); - print_device_vector(10, d_feats, "d_feats"); + //print_device_vector(10, d_feats, "d_feats"); } float_t* Context::get_in_ptr() { return d_feats; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 073ba9eb76..25b06417bb 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -30,6 +30,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); init(); + assert(dropout_rate_ < 1.); scale_ = 1. / (1. 
- dropout_rate_); } @@ -117,14 +118,19 @@ void graph_conv_layer::back_propagation(const float_t* in_data, void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 + //if (level_ == 0) print_device_vector(20, in_data, "in_data"); + //if (level_ == 0) print_device_vector(20, d_W, "W"); if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + //copy_gpu(x*y, in_data, in_temp); + //matmul_gpu(x, z, y, in_temp, d_W, out_temp); + } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); + //if (level_ == 0) print_device_vector(20, out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); - std::cout << "Forward " << name_ << ":\n"; - print_device_vector(10, in_data, "in_data"); + //std::cout << "Forward " << name_ << ":\n"; + //print_device_vector(20, out_data, "out_data"); } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -138,9 +144,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } - std::cout << "Backward " << name_ << ":\n"; - print_device_vector(10, in_data, "in_data"); - print_device_vector(10, out_temp, "out_temp"); sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); + if (level_ == 0) { + std::cout << "Backward " << name_ << ":\n"; + print_device_vector(20, in_data, "in_data"); + print_device_vector(20, out_temp, "out_temp"); + print_device_vector(20, d_weight_grad, "dW"); + } } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 1c305827ac..8457d1255a 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -50,6 +50,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, } acc_t softmax_loss_layer::get_masked_loss() { + assert(count_ > 0); AccumF total_loss; AccumU valid_sample_count; total_loss.reset(); @@ -68,14 +69,15 @@ acc_t softmax_loss_layer::get_masked_loss() { #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, - context->d_labels, loss, out_data); + init_const_gpu(input_dims[0], 0.0, loss); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, + d_masks_, context->d_labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, context->d_labels, out_data, in_grad); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 53ff024872..5d12d04986 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -32,6 +32,17 @@ void copy_masks_device(int n, mask_t* h_masks, 
mask_t*& d_masks) { CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } +__global__ void init_const_kernel(int n, float_t value, float_t *array) { + CUDA_KERNEL_LOOP(i, n) { + array[i] = value; + } +} + +void init_const_gpu(int n, float_t value, float_t *array) { + init_const_kernel<<>>(n, value, array); + CudaTest("solving init_const kernel failed"); +} + void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad) { @@ -60,8 +71,10 @@ __global__ void dropout_kernel(const int n, const float scale, // curandState_t curand_state; //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 1 : 0; - masks[i] = 1.0 - dropout_rate; - out[i] = in[i] * masks[i] * scale; + //masks[i] = 1.0 - dropout_rate; + //out[i] = in[i] * masks[i] * scale; + masks[i] = 1.0; + out[i] = in[i]; } } @@ -82,12 +95,14 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, __global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, const unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } + //CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i]; } } void d_dropout_gpu(const int n, const float scale, const float_t* in, const unsigned* masks, float_t* out) { d_dropout_kernel<<>>(n, scale, in, masks, out); + CudaTest("solving d_dropout kernel failed"); } // flattern data into 1D before feed into the ReLU operater @@ -114,6 +129,28 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, CudaTest("solving d_relu kernel failed"); } +__global__ void matmul_kernel(int x, int y, int z, const float_t* A, + const float_t* B, float_t* C) { + int row = blockIdx.x*blockDim.x+threadIdx.x; + int col = blockIdx.y*blockDim.y+threadIdx.y; + float_t sum = 0.0f; + if (row < x && col < y) { + for (int i = 0; i < z; i++) { + sum += A[row * z + i] * B[i * y + col]; + } + } + C[row * y + col] = sum; +} + +#define TILE_SZ 16 +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C) { + dim3 threadsPerBlock(TILE_SZ, TILE_SZ); + dim3 blocksPerGrid((y-1)/TILE_SZ+1, (x-1)/TILE_SZ+1); + matmul_kernel<<>>(x, y, z, A, B, C); + CudaTest("solving matmul kernel failed"); +} + void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { @@ -124,8 +161,8 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, - K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, @@ -190,7 +227,7 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } -void copy_gpu(size_t len, const float_t* in, float_t* out) { +void copy_gpu(int len, const float_t* in, float_t* out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } @@ -200,87 +237,100 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { } // TODO: use warp -__device__ void softmax(int n, const float_t* input, float_t* output) { +__device__ void softmax_device(int n, const float_t* input, float_t* output) { float_t max = input[0]; - for (size_t i = 1; i < n; i++) + for (int i = 1; i < n; i++) if (input[i] > max) max = input[i]; float_t denominator = 0.0; - for (size_t i = 0; i < n; i++) { - output[i] = exp(input[i] - max); + for (int i = 0; i < n; i++) { + output[i] = expf(input[i] - max); denominator += output[i]; + if (output[i] < 0.0) printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); + //assert(output[i] >= 0.0); } - for (size_t i = 0; i < n; i++) + assert(denominator != 0.0); + for (int i = 0; i < n; i++) { output[i] /= denominator; -} - -// TODO: use warp -__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, float_t* dy) { - for (size_t i = 0; i < n; i++) { - dy[i] = 0; - for (size_t j = 0; j < n; j++) { - float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; - dy[i] += df * dp[j]; - } + //assert(output[i] >= 0.0); + //assert(output[i] <= 1.0); } } -__device__ void cross_entropy(int n, const label_t idx, const float_t* p, float_t& loss) { - if (p[idx] == 0.0) loss -= log(float_t(1e-10)); - else loss -= log(p[idx]); -} - -__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { - for (int i = 0; i < n; i++) - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; +__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, float_t& loss) { + if (p[idx] == 0.0) loss -= logf(float_t(1e-10)); + else loss -= logf(p[idx]); } // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, +__global__ void softmax_cross_entropy_kernel(int len, int begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked // normalize using softmax - softmax(len, in_data + len * i, out_data + len * i); - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len * i], loss[i]); + softmax_device(len, in_data + len*id, out_data + len*id); + //loss[id] = 0.0; + cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); } } } -void softmax_cross_entropy_gpu(int n, int len, const float_t* in, +void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out) { - softmax_cross_entropy_kernel<<>>( - n, len, in, 
masks, labels, loss, out); + softmax_cross_entropy_kernel<<>>( + len, begin, end, in, masks, labels, loss, out); CudaTest("solving softmax_cross_entropy kernel failed"); } -__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, +// TODO: use warp +__device__ void d_softmax(int n, const float_t* p, const float_t* dp, float_t* dy) { + for (int i = 0; i < n; i++) { + dy[i] = 0; + for (int j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { + for (int i = 0; i < n; i++) { + //assert(p[i] >= 0.0); + //assert(p[i] >= 0.0 && p[i] <= 1.0); + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; + } +} + +__global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[i], out + len * i, out_grad); - d_softmax(len, out + len * i, out_grad, diff + len * i); + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + float_t out_grad[41]; // TODO + d_cross_entropy(len, labels[id], out + len*id, out_grad); + d_softmax(len, out + len*id, out_grad, diff + len*id); + } } } -void d_softmax_cross_entropy_gpu(int n, int len, const float_t* in, +void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - d_softmax_cross_entropy_kernel<<>>( - n, len, in, masks, labels, out, diff); + d_softmax_cross_entropy_kernel<<>>( + len, begin, end, masks, labels, out, diff); CudaTest("solving d_softmax_cross_entropy kernel failed"); } -__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, float_t* loss, HGAccumulator total) { total.thread_entry(); @@ -293,8 +343,9 @@ __global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, total.thread_exit>(local_loss); } -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, +acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss) { + assert(count > 0); HGAccumulator loss_accum; Shared total_loss = Shared(1); *(total_loss.cpu_wr_ptr()) = 0; @@ -307,10 +358,10 @@ acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, } // the arguments of the maxima -__device__ size_t argmax_device(const size_t n, const float_t* x) { +__device__ int argmax_device(const int n, const float_t* x) { float_t max = x[0]; - size_t max_ind = 0; - for (size_t i = 1; i < n; i++) { + int max_ind = 0; + for (int i = 1; i < n; i++) { if (x[i] > max) { max_ind = i; max = x[i]; @@ -319,8 +370,8 @@ __device__ size_t argmax_device(const size_t n, const float_t* x) { return max_ind; } -__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, - size_t end, mask_t* masks, +__global__ void masked_accuracy_kernel(int num_classes, int begin, + int end, mask_t* masks, float_t* preds, label_t* labels, HGAccumulator total) { total.thread_entry(); @@ -337,9 +388,10 @@ __global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, total.thread_exit>(local_accuracy); } -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, - size_t count, mask_t* masks, float_t* preds, +acc_t masked_accuracy_gpu(int 
num_classes, int begin, int end, + int count, mask_t* masks, float_t* preds, label_t* labels) { + assert(count > 0); HGAccumulator accuracy_accum; Shared total_accuracy = Shared(1); *(total_accuracy.cpu_wr_ptr()) = 0; diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index ee9ff3b8d4..a9326aaefd 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -15,8 +15,8 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { - std::cout << "updating weights on GPU, n = " << n << "\n"; - print_device_vector(10, dW, "dW"); + //std::cout << "updating weights on GPU, n = " << n << "\n"; + //print_device_vector(10, dW, "dW"); float_t * W1, *W2; CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); From 594d2c78583574da7d8c4fc9989a35e62c7c1eac Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 14:39:26 -0600 Subject: [PATCH 049/660] fix bug in optimizer.cu --- libdeepgalois/include/math_functions.hh | 4 +- libdeepgalois/include/optimizer.h | 16 +++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 49 +++++++++++++++++-- .../src/layers/softmax_loss_layer.cpp | 28 ++++++++++- libdeepgalois/src/math_functions.cu | 46 ++++++++++++----- libdeepgalois/src/optimizer.cu | 14 ++---- 6 files changed, 130 insertions(+), 27 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index ef313815a7..0e0f9f38df 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -71,6 +71,7 @@ void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(int n, const float_t* y, const float_t* p, float_t* d); // GPU operators +bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element void init_const_gpu(int n, float_t value, float_t *array); void copy_gpu(int len, const float_t* in, float_t* out); void vadd_gpu(const int n, const float_t* a, const float_t* b, @@ -103,8 +104,9 @@ acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); +bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); -void loss_malloc_device(int n, float_t*& loss); +void float_malloc_device(int n, float_t*& loss); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index d9f8de9116..96ef841644 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -3,7 +3,9 @@ #include #include #include "types.h" - +#ifndef CPU_ONLY +#include "math_functions.hh" +#endif // base class of optimizer // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) @@ -36,6 +38,18 @@ struct stateful_optimizer : public optimizer { return E_[Index][&key]; } std::unordered_map E_[N]; +#ifndef CPU_ONLY + template + float_t *get_gpu(const size_t n, const float_t *key) { + static_assert(Index < N, "index out of range"); + if (!is_allocated_device(dE_[Index][key])) { + float_malloc_device(n, dE_[Index][key]); + init_const_gpu(n, 0.0, dE_[Index][key]); + } + return dE_[Index][key]; + } + 
std::unordered_map dE_[N]; +#endif }; /** diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 25b06417bb..574e9369c0 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -120,17 +120,46 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, assert(y <= 128); // currently only support feature length <= 128 //if (level_ == 0) print_device_vector(20, in_data, "in_data"); //if (level_ == 0) print_device_vector(20, d_W, "W"); + + if (isnan_gpu(x*z, out_temp)) { + std::cout << name_ << " forward before sgemm Exception: out_temp nan, exiting\n"; + exit(0); + } + init_const_gpu(x*z, 0.0, out_temp); + if (isnan_gpu(x*y, in_temp)) { + std::cout << name_ << " forward Exception: in_temp nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(y*z, d_W)) { + std::cout << name_ << " forward before sgemm Exception: d_W nan, exiting\n"; + exit(0); + } + if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - //copy_gpu(x*y, in_data, in_temp); - //matmul_gpu(x, z, y, in_temp, d_W, out_temp); + //sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + matmul_gpu(x, z, y, in_temp, d_W, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); //if (level_ == 0) print_device_vector(20, out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); //std::cout << "Forward " << name_ << ":\n"; //print_device_vector(20, out_data, "out_data"); + if (isnan_gpu(x*y, in_data)) { + std::cout << name_ << " forward Exception: in_data nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(x*z, out_temp)) { + std::cout << name_ << " forward after sgemm Exception: out_temp nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(x*z, out_data)) { + std::cout << name_ << " forward Exception: out_data nan, exiting\n"; + exit(0); + } } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -143,6 +172,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); + if (isnan_gpu(x*y, in_grad)) { + std::cout << name_ << "Exception: ingrad nan, exiting\n"; + exit(0); + } } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); if (level_ == 0) { @@ -151,5 +184,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, print_device_vector(20, out_temp, "out_temp"); print_device_vector(20, d_weight_grad, "dW"); } + + if (isnan_gpu(x*z, out_temp)) { + std::cout << name_ << " backward Exception: out_temp nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(y*z, d_weight_grad)) { + std::cout << name_ << "Exception: ingrad nan, exiting\n"; + exit(0); + } } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 8457d1255a..c75781843b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,7 +9,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, #ifdef CPU_ONLY loss = new 
float_t[in_dims[0]]; // error for each sample #else - loss_malloc_device(in_dims[0], loss); + float_malloc_device(in_dims[0], loss); #endif } #ifdef CPU_ONLY @@ -70,8 +70,30 @@ acc_t softmax_loss_layer::get_masked_loss() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); + if (isnan_gpu(input_dims[0]*input_dims[1], in_data)) { + std::cout << name_ << " Exception: in_data nan, exiting\n"; + exit(0); + } + if (isnan_gpu(output_dims[0], loss)) { + std::cout << name_ << " Exception: loss nan, exiting\n"; + exit(0); + } + /* + if (isnan_gpu(output_dims[0], d_masks_)) { + std::cout << name_ << " Exception: masks nan, exiting\n"; + exit(0); + } + if (isnan_gpu(output_dims[0], context->d_labels)) { + std::cout << name_ << " Exception: labels nan, exiting\n"; + exit(0); + }*/ + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, context->d_labels, loss, out_data); + if (isnan_gpu(output_dims[0]*output_dims[1], out_data)) { + std::cout << name_ << " Exception: out_data nan, exiting\n"; + exit(0); + } } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -79,6 +101,10 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, context->d_labels, out_data, in_grad); + if (isnan_gpu(input_dims[1]*input_dims[1], in_grad)) { + std::cout << name_ << " Exception: ingrad nan, exiting\n"; + exit(0); + } } acc_t softmax_loss_layer::get_masked_loss() { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 5d12d04986..99b83e4d6e 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -5,6 +5,29 @@ #include "cub/cub.cuh" #include +__global__ void init_const_kernel(int n, float_t value, float_t *array) { + CUDA_KERNEL_LOOP(i, n) { array[i] = value; } +} + +void init_const_gpu(int n, float_t value, float_t *array) { + init_const_kernel<<>>(n, value, array); + CudaTest("solving init_const kernel failed"); +} + +__global__ void isnan_test(const int n, const float *data, bool *result) { + CUDA_KERNEL_LOOP(i, n) { if (isnan(data[i])) *result = true; } +} + +bool isnan_gpu(int n, const float_t *array) { + bool *d_result, h_result = false; + cudaMalloc((void **)&d_result, sizeof (bool)); + cudaMemcpy(d_result, &h_result, sizeof(bool), cudaMemcpyHostToDevice); + isnan_test<<>>(n, array, d_result); + CudaTest("solving init_const kernel failed"); + cudaMemcpy(&h_result, d_result, sizeof(bool), cudaMemcpyDeviceToHost); + return h_result; +} + void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } @@ -22,7 +45,15 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_ CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void loss_malloc_device(int n, float_t*& loss) { +bool is_allocated_device(float_t* data) { + if (data == NULL) return false; + cudaPointerAttributes attributes; + CUDA_CHECK(cudaPointerGetAttributes(&attributes, data)); + if (attributes.devicePointer != NULL) return true; + return false; +} + +void float_malloc_device(int n, float_t*& loss) { CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } @@ -32,23 +63,14 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } 
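// A minimal sketch (not part of this patch) of the NaN-guard pattern this
// commit repeats through graph_conv_layer and softmax_loss_layer: probe a
// device buffer with the isnan_gpu() helper added above and abort with a
// descriptive message. The wrapper name check_nan_or_die is hypothetical;
// isnan_gpu(), float_t and the layer's name_ string come from the code above.
inline void check_nan_or_die(int n, const float_t* d_buf, const std::string& where) {
  if (isnan_gpu(n, d_buf)) {            // true if any element of d_buf is NaN
    std::cout << where << " Exception: nan detected, exiting\n";
    exit(0);
  }
}
// usage inside a layer, e.g.: check_nan_or_die(x * z, out_temp, name_ + " forward");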
-__global__ void init_const_kernel(int n, float_t value, float_t *array) { - CUDA_KERNEL_LOOP(i, n) { - array[i] = value; - } -} - -void init_const_gpu(int n, float_t value, float_t *array) { - init_const_kernel<<>>(n, value, array); - CudaTest("solving init_const kernel failed"); -} - void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad) { if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); + init_const_gpu(x*y, 0.0, in); CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); + init_const_gpu(x*z, 0.0, out); CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); auto init_range = sqrt(6.0 / (y + z)); // Glorot & Bengio (AISTATS 2010) diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index a9326aaefd..bf279e4e37 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -10,22 +10,18 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; W[i] -= alpha * (mt[i] / (1.0 - b1_t)) / - std::sqrt((vt[i] / (1.0 - b2_t)) + eps); + sqrtf((vt[i] / (1.0 - b2_t)) + eps); } } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); - float_t * W1, *W2; - CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); - copy_gpu(n, W, W1); - copy_gpu(n, W, W2); + float_t* cache = get_gpu<0>(n, W); + float_t* velocity = get_gpu<1>(n, W); + update_kernel<<>>( - n, alpha, b1, b2, b1_t, b2_t, eps, W1, W2, dW, W); + n, alpha, b1, b2, b1_t, b2_t, eps, cache, velocity, dW, W); b1_t *= b1; b2_t *= b2; - CUDA_CHECK(cudaFree(W1)); - CUDA_CHECK(cudaFree(W2)); } From 2f62cf42885ab646a0e4108d2bb910ef1bfda377 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 14:46:20 -0600 Subject: [PATCH 050/660] gpu working --- libdeepgalois/src/layers/graph_conv_layer.cpp | 49 +++---------------- .../src/layers/softmax_loss_layer.cpp | 26 ---------- 2 files changed, 6 insertions(+), 69 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 574e9369c0..c0c07cb889 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -121,45 +121,17 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, //if (level_ == 0) print_device_vector(20, in_data, "in_data"); //if (level_ == 0) print_device_vector(20, d_W, "W"); - if (isnan_gpu(x*z, out_temp)) { - std::cout << name_ << " forward before sgemm Exception: out_temp nan, exiting\n"; - exit(0); - } init_const_gpu(x*z, 0.0, out_temp); - if (isnan_gpu(x*y, in_temp)) { - std::cout << name_ << " forward Exception: in_temp nan, exiting\n"; - exit(0); - } - - if (isnan_gpu(y*z, d_W)) { - std::cout << name_ << " forward before sgemm Exception: d_W nan, exiting\n"; - exit(0); - } - if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - //sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - matmul_gpu(x, z, y, in_temp, d_W, out_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + //matmul_gpu(x, 
z, y, in_temp, d_W, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); //if (level_ == 0) print_device_vector(20, out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); //std::cout << "Forward " << name_ << ":\n"; //print_device_vector(20, out_data, "out_data"); - if (isnan_gpu(x*y, in_data)) { - std::cout << name_ << " forward Exception: in_data nan, exiting\n"; - exit(0); - } - - if (isnan_gpu(x*z, out_temp)) { - std::cout << name_ << " forward after sgemm Exception: out_temp nan, exiting\n"; - exit(0); - } - - if (isnan_gpu(x*z, out_data)) { - std::cout << name_ << " forward Exception: out_data nan, exiting\n"; - exit(0); - } } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -172,22 +144,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); - if (isnan_gpu(x*y, in_grad)) { - std::cout << name_ << "Exception: ingrad nan, exiting\n"; - exit(0); - } } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); if (level_ == 0) { - std::cout << "Backward " << name_ << ":\n"; - print_device_vector(20, in_data, "in_data"); - print_device_vector(20, out_temp, "out_temp"); - print_device_vector(20, d_weight_grad, "dW"); - } - - if (isnan_gpu(x*z, out_temp)) { - std::cout << name_ << " backward Exception: out_temp nan, exiting\n"; - exit(0); + //std::cout << "Backward " << name_ << ":\n"; + //print_device_vector(20, in_data, "in_data"); + //print_device_vector(20, out_temp, "out_temp"); + //print_device_vector(20, d_weight_grad, "dW"); } if (isnan_gpu(y*z, d_weight_grad)) { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index c75781843b..af04b06bbf 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -70,30 +70,8 @@ acc_t softmax_loss_layer::get_masked_loss() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - if (isnan_gpu(input_dims[0]*input_dims[1], in_data)) { - std::cout << name_ << " Exception: in_data nan, exiting\n"; - exit(0); - } - if (isnan_gpu(output_dims[0], loss)) { - std::cout << name_ << " Exception: loss nan, exiting\n"; - exit(0); - } - /* - if (isnan_gpu(output_dims[0], d_masks_)) { - std::cout << name_ << " Exception: masks nan, exiting\n"; - exit(0); - } - if (isnan_gpu(output_dims[0], context->d_labels)) { - std::cout << name_ << " Exception: labels nan, exiting\n"; - exit(0); - }*/ - softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, context->d_labels, loss, out_data); - if (isnan_gpu(output_dims[0]*output_dims[1], out_data)) { - std::cout << name_ << " Exception: out_data nan, exiting\n"; - exit(0); - } } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -101,10 +79,6 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, context->d_labels, out_data, in_grad); - if (isnan_gpu(input_dims[1]*input_dims[1], in_grad)) { - std::cout << name_ << " Exception: ingrad nan, 
exiting\n"; - exit(0); - } } acc_t softmax_loss_layer::get_masked_loss() { From a6ebf28cdeb4db15a6a04fdd350eab94e937dd86 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 17:38:10 -0600 Subject: [PATCH 051/660] fix include/math_functions.hh --- libdeepgalois/include/math_functions.hh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 0e0f9f38df..414635b0e2 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -53,22 +53,22 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, vec_t& out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, float_t* out); -void dropout(int n, const float scale, const float dropout_rate, +void dropout(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* mask, float_t* out); void d_dropout(const float scale, const vec_t& in_diff, std::vector& mask, vec_t& out_diff); // dropout derivative -void d_dropout(int n, const float scale, const float_t* in_diff, +void d_dropout(size_t n, const float scale, const float_t* in_diff, unsigned* mask, float_t* out_diff); void softmax(const vec_t& input, vec_t& output); -void softmax(int n, const float_t* input, float_t* output); +void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(int n, const float_t* y, const float_t* p, float_t* dy, +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); float_t cross_entropy(const vec_t& y, const vec_t& p); -float_t cross_entropy(int n, const float_t* y, const float_t* p); +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); -void d_cross_entropy(int n, const float_t* y, const float_t* p, float_t* d); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element From 3efa9b8f9e6d508f51bd53e1347ed94fac75e5b3 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 17:51:54 -0600 Subject: [PATCH 052/660] remove debug code --- libdeepgalois/src/layers/graph_conv_layer.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c0c07cb889..710bd79b64 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -118,20 +118,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 - //if (level_ == 0) print_device_vector(20, in_data, "in_data"); - //if (level_ == 0) print_device_vector(20, d_W, "W"); - init_const_gpu(x*z, 0.0, out_temp); if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - //matmul_gpu(x, z, y, in_temp, d_W, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - //if (level_ == 0) print_device_vector(20, 
out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); - //std::cout << "Forward " << name_ << ":\n"; - //print_device_vector(20, out_data, "out_data"); } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -146,16 +139,5 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); - if (level_ == 0) { - //std::cout << "Backward " << name_ << ":\n"; - //print_device_vector(20, in_data, "in_data"); - //print_device_vector(20, out_temp, "out_temp"); - //print_device_vector(20, d_weight_grad, "dW"); - } - - if (isnan_gpu(y*z, d_weight_grad)) { - std::cout << name_ << "Exception: ingrad nan, exiting\n"; - exit(0); - } } #endif From 8089b4eec819d43d775160edea67bd78fcffd51b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 27 Feb 2020 14:16:53 -0600 Subject: [PATCH 053/660] float->float_t --- libdeepgalois/include/layers/graph_conv_layer.h | 6 +++--- libdeepgalois/include/net.h | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 7dfc8c2154..86a91c7287 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -18,7 +18,7 @@ class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, + float_t dropout_rate, std::vector in_dims, std::vector out_dims); graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -49,8 +49,8 @@ class graph_conv_layer : public layer { bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards bool dropout_; // whether to use dropout at first - const float dropout_rate_; - float scale_; + const float_t dropout_rate_; + float_t scale_; net_phase phase_; size_t x; size_t y; diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 4a83caaf88..0182a7e65e 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -39,7 +39,7 @@ class Net { void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, - float dropout_rate = 0.5) { + float_t dropout_rate = 0.5) { assert(dropout_rate < 1.0); assert(layer_id < NUM_CONV_LAYERS); std::vector in_dims(2), out_dims(2); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 710bd79b64..7f69f915de 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -18,7 +18,7 @@ void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& } graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, - bool bias, bool dropout, float dropout_rate, + bool bias, bool dropout, float_t dropout_rate, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), From b95adf0ca0d0d1c3f9aeefc945fdb95362fd4393 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:21:39 -0600 Subject: [PATCH 054/660] various comments on some files --- libdeepgalois/include/lgraph.h | 14 ++++++++++++++ 
libdeepgalois/include/net.h | 1 + libdeepgalois/include/utils.h | 2 ++ libdeepgalois/src/context.cpp | 4 ++-- lonestargnn/gcn/gcn.cpp | 1 + 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index 65cd004c82..b15a505b45 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -11,6 +11,12 @@ typedef unsigned IndexT; typedef float ValueT; +/** + * Used to temporarily store read edges from edge list; graph itself doesn't + * use these. + * + * Source, dest, label. + */ struct Edge { IndexT src; IndexT dst; @@ -25,6 +31,14 @@ struct Edge { }; typedef std::vector EdgeList; +/** + * Learning graph. + * + * Provides basic accesors and such; nothing special. Just a CSR. + * Ultimatly becomes an LC_CSR. + * + * @todo remove this intermediate step if using edgelists + */ class LGraph { public: LGraph() : symmetrize_(false), directed_(false) {} diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 0182a7e65e..743ac5ea11 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -24,6 +24,7 @@ class Net { size_t get_nnodes() { return num_samples; } void train(optimizer* opt, bool need_validate); // training void construct_layers(); + //! Save the context object to all layers of the network void set_contexts() { for (size_t i = 0; i < num_layers; i++) layers[i]->set_context(context); diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 1c330daa5b..6ce0ef105f 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -94,6 +94,8 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) <= p; } +//! Get masks from datafile where first line tells range of +//! set to create mask from inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index aab3e1c3cd..237416342c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -58,8 +58,6 @@ void Context::genGraph(LGraph& lg, Graph& g) { float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif -// user-defined pre-computing function, called during initialization -// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY norm_factor = new float_t[n]; @@ -113,6 +111,8 @@ size_t Context::read_labels(std::string dataset_str) { return num_classes; } +//! Read features, return the length of a feature vector +//! Features are stored in the Context class size_t Context::read_features(std::string dataset_str) { std::cout << "Reading features ... 
"; Timer t_read; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 9bfe231181..fe0e2708a6 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -10,6 +10,7 @@ int main(int argc, char** argv) { galois::SharedMemSys G; LonestarGnnStart(argc, argv, name, desc, url); Net network; // the neural network to train + // read network, features, ground truth, initialize metadata network.init(dataset, epochs, hidden1); network.construct_layers(); // default setting for now; can be customized by // the user From ad7a9df6582043ab40ba310a990014e360b319f0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:45:59 -0600 Subject: [PATCH 055/660] optimizer; tiny dnn copyright --- libdeepgalois/include/optimizer.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 96ef841644..28cbabc5f5 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -1,3 +1,11 @@ +/** + * Code modified from below link. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. + * Reused under 3-BSD + */ #pragma once #include @@ -6,6 +14,7 @@ #ifndef CPU_ONLY #include "math_functions.hh" #endif + // base class of optimizer // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) From 72af67a587a3e838865733eb586ffb4f43afc3c4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:47:21 -0600 Subject: [PATCH 056/660] added licensenote.txt for later release purposes --- libdeepgalois/licensenote.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 libdeepgalois/licensenote.txt diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt new file mode 100644 index 0000000000..c1e14addca --- /dev/null +++ b/libdeepgalois/licensenote.txt @@ -0,0 +1,8 @@ +TODO + +figure out which files have coded based on other codebsaes, get license, +note here + +e.g. +https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn +under BSD-3 From b90ae48cd90f9843a257ab88060e61f4f1343bd7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:57:38 -0600 Subject: [PATCH 057/660] layer copyright + some comments i made while reading --- libdeepgalois/include/layers/layer.h | 34 ++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 438ee45993..b4fecdbca2 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -1,4 +1,13 @@ #pragma once +/** + * Code based on below link. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. + * Reused/revised under 3-BSD + */ #include #include @@ -29,8 +38,10 @@ * - in_shape ... specify input data shapes * - out_shape ... specify output data shapes * - layer_type ... name of layer + * + * Node inheritance is just to get accessed to linked-list semantics it + * provides **/ - class layer : public node { public: layer(unsigned level, std::vector in_dims, @@ -43,21 +54,26 @@ class layer : public node { virtual ~layer() = default; virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} + //! 
save context virtual void set_context(Context* ctx) { context = ctx; } virtual acc_t get_masked_loss() { return acc_t(0); } - // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = - // 0; virtual void back_propagation(const vec_t &in_data, const vec_t - // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + + // main functions for layer work virtual void forward_propagation(const float_t* in_data, float_t* out_data) = 0; virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) = 0; + // is this layer trainable? void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } + + // name metadata void set_name(std::string name) { name_ = name; } std::string get_name() { return name_; } + mask_t* get_device_masks() { return d_masks_; } + //! debug print function void print_layer_info() { std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" @@ -73,11 +89,14 @@ class layer : public node { copy_masks_device(input_dims[0], masks_, d_masks_); #endif } + + //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { prev_ = std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. } + void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); @@ -87,15 +106,22 @@ class layer : public node { void alloc_grad() { // allocate memory for intermediate gradients } + + //! calls forward propagation using previous layer as input and writes + //! to next layer as output void forward() { // std::cout << name_ << ": forwarding ... "; forward_propagation(prev()->get_data(), next()->get_data()); } + + //! calls backward propagation void backward() { // std::cout << name_ << ": backwarding ... "; back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } + + //! use optimizer to update weights given gradient void update_weight(optimizer* opt) { // vec_t diff; // prev()->merge_grads(&diff); From 03927df74445ac51319f09153b6132849039b942 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:57:57 -0600 Subject: [PATCH 058/660] node.h copyright --- libdeepgalois/include/node.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 8b48e85aa8..947e997275 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -1,4 +1,14 @@ #pragma once +/** + * Code modified from below + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/node.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused/revised under 3-BSD + */ + #include #include #include From 6727ddd377ec54cebfb61dbd622e62ed49ae6c23 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 29 Feb 2020 14:35:27 -0600 Subject: [PATCH 059/660] add pubmed dataset --- libdeepgalois/include/utils.h | 6 +++--- libdeepgalois/src/net.cpp | 4 ++-- lonestargnn/gcn/gcn.cpp | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 6ce0ef105f..086dcf321a 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -99,7 +99,7 @@ inline bool bernoulli(float_t p) { inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks) { - if (dataset_str != "citeseer" && dataset_str != "cora") { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { std::cout << "Dataset currently not supported\n"; exit(1); } @@ -123,8 +123,8 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, } i++; } - // std::cout << mask_type + "_mask range: [" << begin << ", " << end - // << ") Number of valid samples: " << sample_count << "\n"; + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "\n"; in.close(); return sample_count; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9b78853833..6e253a2afd 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -8,7 +8,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; - std::cout << "Reading label masks ... "; + //std::cout << "Reading label masks ... 
"; train_mask.resize(num_samples, 0); val_mask.resize(num_samples, 0); // get testing and validation sets @@ -25,7 +25,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); } - std::cout << "Done\n"; + //std::cout << "Done\n"; num_layers = NUM_CONV_LAYERS + 1; // initialize feature metadata diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index fe0e2708a6..55c4e2320f 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -27,6 +27,7 @@ int main(int argc, char** argv) { Ttrain.stop(); if (do_test) { + std::cout << "\n"; // test using test samples size_t n = network.get_nnodes(); acc_t test_loss = 0.0, test_acc = 0.0; @@ -44,7 +45,7 @@ int main(int argc, char** argv) { Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss + std::cout << "Testing: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; Ttest.stop(); From f6c35c88c5ce1b98e42fbc06b9536ae5619a8b6f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 29 Feb 2020 15:56:15 -0600 Subject: [PATCH 060/660] update aggregator --- libdeepgalois/include/types.h | 3 ++- libdeepgalois/src/aggregator.cu | 36 ++++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 387b5f5b60..b669a25188 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -21,6 +21,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test #define CHUNK_SIZE 256 #define TB_SIZE 256 +#define BLOCK_SIZE 256 #define WARP_SIZE 32 - +#define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index f0c06722b6..522975dca3 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -12,7 +12,7 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, +__global__ void update_all_naive(size_t n, size_t len, CSRGraph g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { @@ -29,13 +29,43 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, } } +__global__ void update_all_warp(size_t n, size_t len, CSRGraph g, + const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + __shared__ index_type ptrs[BLOCK_SIZE/WARP_SIZE][2]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for(int src = warp_id; src < n; src += num_warps) { + float_t a = 0.0, b = 1.0; + if (norm) a = norm_factor[src]; + if (thread_lane < 2) + ptrs[warp_lane][thread_lane] = g.edge_begin(src + thread_lane); + __syncthreads(); + const index_type row_begin = ptrs[warp_lane][0]; + const index_type row_end = ptrs[warp_lane][1]; + 
index_type base_src = src * len; + for(index_type offset = row_begin; offset < row_end; offset ++) { + index_type dst = g.getEdgeDst(offset); + if (norm) b = a * norm_factor[dst]; + index_type base_dst = dst * len; + for (int i = 0; i < len; i += WARP_SIZE) + if (thread_lane+i < len) + out[base_src+thread_lane+i] += in[base_dst+thread_lane+i] * b; + } + } +} + void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; //print_device_vector(10, norm_factor, "norm_factor"); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>( - n, len, g, in, out, norm, norm_factor); + //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); + update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } From 53a8ee82061d94cd04f9bdbf2def855bbdee8e6e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 09:51:14 -0600 Subject: [PATCH 061/660] update dropout_gpu --- libdeepgalois/include/math_functions.hh | 4 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 36 +++++++------------ 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 414635b0e2..d647a35e3a 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -81,8 +81,8 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out); // dropout -void d_dropout_gpu(const int n, const float scale, const float_t* in, - const unsigned* masks, float_t* out); // dropout derivative +void d_dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, const unsigned* masks, float_t* out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 7f69f915de..115b297512 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -136,7 +136,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); + if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 99b83e4d6e..ce8a8283ff 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -87,43 +87,31 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { } __global__ void dropout_kernel(const int n, const float scale, - const float dropout_rate, const float_t* in, - unsigned* masks, 
curandState* state, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - // curandState_t curand_state; - //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 1 : 0; - //masks[i] = 1.0 - dropout_rate; - //out[i] = in[i] * masks[i] * scale; - masks[i] = 1.0; - out[i] = in[i]; - } + const float threshold, const float_t* in, + unsigned* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } } void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { - curandState* devStates; - CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); - //std::cout << "[debug]: setup curand, n = " << n << "\n"; - //setup_curand_kernel<<>>(n, devStates); - //CudaTest("solving setup_curand kernel failed"); + gpu_rng_uniform(n, masks); //std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, devStates, out); + n, scale, dropout_rate, in, masks, out); CudaTest("solving dropout kernel failed"); - CUDA_CHECK(cudaFree(devStates)); //std::cout << "[debug]: dropout_gpu done\n"; } -__global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, +__global__ void d_dropout_kernel(const int n, const float scale, + const float threshold, const float_t* in, const unsigned* masks, float_t* out) { - //CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i]; } + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } } -void d_dropout_gpu(const int n, const float scale, const float_t* in, - const unsigned* masks, float_t* out) { - d_dropout_kernel<<>>(n, scale, in, masks, out); +void d_dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, const unsigned* masks, float_t* out) { + d_dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); CudaTest("solving d_dropout kernel failed"); } From 405e8a47a6b51b2835f3441d84ad3c8a3251604b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 13:20:03 -0600 Subject: [PATCH 062/660] add cusparse --- libdeepgalois/include/aggregator.h | 2 + libdeepgalois/include/context.h | 21 +++++-- libdeepgalois/include/cutils.h | 38 ++++++++++++ libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/math_functions.hh | 4 ++ libdeepgalois/src/aggregator.cu | 11 +++- libdeepgalois/src/context.cu | 59 +++++++++++++++---- libdeepgalois/src/layers/graph_conv_layer.cpp | 13 +++- libdeepgalois/src/math_functions.cu | 17 ++++++ libgpu/include/graph_gpu.h | 8 +++ 10 files changed, 153 insertions(+), 22 deletions(-) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 552925c1bf..6853ea7126 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -8,4 +8,6 @@ void update_all(size_t len, Graph& g, const float_t* in, float_t* out, #include "graph_gpu.h" void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, + float_t* out, bool norm, const float_t* norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 47b32d023e..7444e90251 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -1,4 +1,14 @@ #pragma once +/** + * Code modified from below + * + * 
https://github.com/BVLC/caffe/blob/master/include/caffe/common.hpp + * + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * Reused/revised under BSD 2-Clause license + */ + #include #include #include "types.h" @@ -58,6 +68,8 @@ class Context { #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } @@ -66,15 +78,12 @@ class Context { protected: #ifndef CPU_ONLY static cublasHandle_t cublas_handle_; // used to call cuBLAS - static curandGenerator_t - curand_generator_; // used to generate random numbers on GPU + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif Brew mode_; int solver_count_; int solver_rank_; bool multiprocess_; - -private: - // The private constructor to avoid duplicate instantiation. - // Context(); }; diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index c817863242..7be873a183 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -56,6 +57,32 @@ inline const char* cublasGetErrorString(cublasStatus_t error) { return "Unknown cublas status"; } +inline const char* cusparseGetErrorString(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return "CUSPARSE_STATUS_ZERO_PIVOT"; + } + return "Unknown cusparse status"; +} + inline const char* curandGetErrorString(curandStatus_t error) { switch (error) { case CURAND_STATUS_SUCCESS: @@ -110,6 +137,17 @@ inline const char* curandGetErrorString(curandStatus_t error) { } \ } while (0) +#define CUSPARSE_CHECK(condition) \ + do { \ + cusparseStatus_t status = condition; \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuSPARSE error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cusparseGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #define CURAND_CHECK(condition) \ do { \ curandStatus_t status = condition; \ diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index b4fecdbca2..355f75a440 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -132,7 +132,7 @@ class layer : public node { bool parallel = (W.size() >= 512); 
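    // For reference, a sketch (not part of this patch) of what the adam
    // optimizer's update/update_gpu applies per weight i, mirroring
    // update_kernel in optimizer.cu:
    //   mt[i] = b1 * mt[i] + (1 - b1) * dW[i]
    //   vt[i] = b2 * vt[i] + (1 - b2) * dW[i] * dW[i]
    //   W[i] -= alpha * (mt[i] / (1 - b1_t)) / sqrt(vt[i] / (1 - b2_t) + eps)
    // where b1_t and b2_t are multiplied by b1 and b2 after every call.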
opt->update(weight_grad, W, parallel); // W += grad #else - std::cout << name_ << ": "; + //std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index d647a35e3a..f89f34a5a5 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -91,6 +91,10 @@ void matmul_gpu(const size_t x, const size_t y, const size_t z, void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nonzero_idx, + const float* B, const float beta, float* C); void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data); diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 522975dca3..bbd7fbf8b3 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -62,10 +62,17 @@ __global__ void update_all_warp(size_t n, size_t len, CSRGraph g, void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; - //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - //print_device_vector(10, norm_factor, "norm_factor"); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } + +void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.nnodes; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + //print_device_vector(10, norm_factor, "norm_factor"); + csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); +} diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 647e010f60..1ba6bcc8bd 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -22,7 +22,8 @@ int64_t cluster_seedgen(void) { return seed; } -__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, +// computing normalization factor for each vertex +__global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); @@ -33,34 +34,69 @@ __global__ void norm_factor_counting_kernel(int n, CSRGraph graph, } } +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_counting_edge(int n, CSRGraph graph, + float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + index_type start = graph.edge_begin(src); + index_type end = graph.edge_end(src); + for (index_type e = start; e != end; e++) { + index_type dst = graph.getEdgeDst(e); + float_t 
d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + void Context::norm_factor_counting_gpu() { - std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; assert(graph_gpu.nnodes == n); + std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; +#ifdef USE_CUSPARSE + int nnz = graph_gpu.nedges; + CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, nnz * sizeof(float_t))); + init_const_kernel<<>>(nnz, 0.0, d_norm_factor); + norm_factor_counting_edge<<>>( + n, graph_gpu, d_norm_factor); +#else CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); - norm_factor_counting_kernel<<>>( + norm_factor_counting_node<<>>( n, graph_gpu, d_norm_factor); +#endif CudaTest("solving norm_factor_counting kernel failed"); } -cublasHandle_t Context::cublas_handle_ = 0; -curandGenerator_t Context::curand_generator_ = 0; +cublasHandle_t Context::cublas_handle_ = 0; +cusparseHandle_t Context::cusparse_handle_ = 0; +cusparseMatDescr_t Context::cusparse_matdescr_ = 0; +curandGenerator_t Context::curand_generator_ = 0; Context::Context() : mode_(Context::GPU), solver_count_(1), solver_rank_(0), multiprocess_(false) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); + CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); + CUSPARSE_CHECK(cusparseSetMatType(cusparse_matdescr_,CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CHECK(cusparseSetMatIndexBase(cusparse_matdescr_,CUSPARSE_INDEX_BASE_ZERO)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { + if (cusparse_handle_) + CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); + if (cusparse_matdescr_) + CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } } void Context::SetDevice(const int device_id) { @@ -100,3 +136,4 @@ void Context::copy_data_to_device() { } float_t* Context::get_in_ptr() { return d_feats; } + diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 115b297512..753deed714 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -3,11 +3,16 @@ #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->norm_factor); +} #else void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + #ifdef USE_CUSPARSE + update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + #else update_all(len, g, in, out, true, context->d_norm_factor); -#endif + #endif } +#endif void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& out) { vec_t a(out.size(), 0); @@ -35,7 +40,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { - std::cout << name_ << ": allocating memory for params and temp data... 
"; + //std::cout << name_ << ": allocating memory for params and temp data... "; Timer t_alloc; t_alloc.Start(); #ifdef CPU_ONLY @@ -135,7 +140,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, else copy_gpu(x * z, out_grad, out_temp); if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); +#ifdef USE_CUSPARSE + update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#else update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index ce8a8283ff..eb8f07c8b3 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -183,6 +183,23 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float* C) { + float *transpose_C; + CUDA_CHECK(cudaMalloc((void**)&transpose_C, N * K * sizeof(float))); + CUSPARSE_CHECK(cusparseScsrmm2(Context::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + M, N, K, nnz, &alpha, Context::cusparse_matdescr(), A_nonzeros, + A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); + //transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, + N, M, &one, transpose_C, M, &zero, transpose_C, M, C, N)); +} + void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 050d7bbc69..4784e510a5 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -125,6 +125,14 @@ struct CSRGraph { assert(src <= nnodes); return row_start[src+1]; }; + __device__ __host__ index_type *row_start_ptr() { return row_start; } + __device__ __host__ const index_type *row_start_ptr() const { return row_start; } + __device__ __host__ index_type *edge_dst_ptr() { return edge_dst; } + __device__ __host__ const index_type *edge_dst_ptr() const { return edge_dst; } + __device__ __host__ node_data_type *node_data_ptr() { return node_data; } + __device__ __host__ const node_data_type *node_data_ptr() const { return node_data; } + __device__ __host__ edge_data_type *edge_data_ptr() { return edge_data; } + __device__ __host__ const edge_data_type *edge_data_ptr() const { return edge_data; } index_type nnodes, nedges; index_type* row_start; // row_start[node] points into edge_dst, node starts at From 43a52f34c2477df2200df99b08416aaa630158de Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 14:02:30 -0600 Subject: [PATCH 063/660] update lgraph --- libdeepgalois/include/lgraph.h | 146 ++++++--------------------------- 1 file changed, 24 insertions(+), 122 deletions(-) diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index b15a505b45..f3426db2a2 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -11,26 +11,6 @@ typedef 
unsigned IndexT; typedef float ValueT; -/** - * Used to temporarily store read edges from edge list; graph itself doesn't - * use these. - * - * Source, dest, label. - */ -struct Edge { - IndexT src; - IndexT dst; - ValueT elabel; - Edge() : src(0), dst(0), elabel(0) {} - Edge(IndexT from, IndexT to, ValueT el) : src(from), dst(to), elabel(el) {} - std::string to_string() const { - std::stringstream ss; - ss << "e(" << src << "," << dst << "," << elabel << ")"; - return ss.str(); - } -}; -typedef std::vector EdgeList; - /** * Learning graph. * @@ -41,15 +21,10 @@ typedef std::vector EdgeList; */ class LGraph { public: - LGraph() : symmetrize_(false), directed_(false) {} + LGraph() : directed_(false) {} void clean() { delete[] rowptr_; delete[] colidx_; - delete[] weight_; - degrees.clear(); - el.clear(); - // labels_.clear(); - // vertices.clear(); } bool directed() const { return directed_; } size_t num_vertices() const { return num_vertices_; } @@ -59,111 +34,49 @@ class LGraph { unsigned out_degree(IndexT n) const { return rowptr_[n + 1] - rowptr_[n]; } IndexT get_offset(IndexT n) { return rowptr_[n]; } IndexT get_dest(IndexT n) { return colidx_[n]; } - ValueT get_weight(IndexT n) { return weight_[n]; } - unsigned get_max_degree() { return max_degree; } - // ValueT * labels() { return labels_.data(); } - // ValueT get_label(IndexT n) { return labels_[n]; } - void read_edgelist(const char* filename, bool symmetrize = false) { + + void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false) { std::ifstream in; std::string line; in.open(filename, std::ios::in); - IndexT max_vid = 0; + size_t m, n; + in >> m >> n >> std::ws; + num_vertices_ = m; + num_edges_ = 0; + std::cout << "num_vertices " << num_vertices_ << "\n"; + vertices.resize(m); + for (size_t i = 0; i < n; i++) { + std::set neighbors; + if (add_self_loop) neighbors.insert(i); + vertices.push_back(neighbors); + } while (std::getline(in, line)) { std::istringstream edge_stream(line); IndexT u, v; edge_stream >> u; edge_stream >> v; - el.push_back(Edge(u, v, 1)); - if (symmetrize) - el.push_back(Edge(v, u, 1)); - if (u > max_vid) - max_vid = u; - if (v > max_vid) - max_vid = v; + vertices[u].insert(v); + if (symmetrize) vertices[v].insert(u); } in.close(); - directed_ = true; - num_vertices_ = max_vid + 1; - num_edges_ = el.size(); - std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " - << num_edges_ << "\n"; - MakeGraphFromEL(); + for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); + std::cout << "num_edges " << num_edges_ << "\n"; + MakeCSR(vertices); } private: - EdgeList el; - bool symmetrize_; // whether to symmetrize a directed graph bool directed_; size_t num_vertices_; size_t num_edges_; IndexT* rowptr_; IndexT* colidx_; - ValueT* weight_; - unsigned max_degree; - std::vector degrees; - std::vector labels_; - std::vector> vertices; - - static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } - void MakeGraphFromEL() { - SquishGraph(); - MakeCSR(false); - } - - void SquishGraph(bool remove_selfloops = true, - bool remove_redundents = true) { - std::vector neighbors; - for (size_t i = 0; i < num_vertices_; i++) - vertices.push_back(neighbors); - for (size_t i = 0; i < num_edges_; i++) - vertices[el[i].src].push_back(el[i]); - el.clear(); - printf("Sorting the neighbor lists..."); - for (size_t i = 0; i < num_vertices_; i++) - std::sort(vertices[i].begin(), vertices[i].end(), compare_id); - printf(" Done\n"); - // remove self loops - int num_selfloops = 0; - 
if (remove_selfloops) { - printf("Removing self loops..."); - for (size_t i = 0; i < num_vertices_; i++) { - for (unsigned j = 0; j < vertices[i].size(); j++) { - if (i == vertices[i][j].dst) { - vertices[i].erase(vertices[i].begin() + j); - num_selfloops++; - j--; - } - } - } - printf(" %d selfloops are removed\n", num_selfloops); - num_edges_ -= num_selfloops; - } - // remove redundent - int num_redundents = 0; - if (remove_redundents) { - printf("Removing redundent edges..."); - for (size_t i = 0; i < num_vertices_; i++) { - for (unsigned j = 1; j < vertices[i].size(); j++) { - if (vertices[i][j].dst == vertices[i][j - 1].dst) { - vertices[i].erase(vertices[i].begin() + j); - num_redundents++; - j--; - } - } - } - printf(" %d redundent edges are removed\n", num_redundents); - num_edges_ -= num_redundents; - } - } - - void MakeCSR(bool transpose) { + void MakeCSR(std::vector > vertices, bool transpose) { + std::vector degrees; degrees.resize(num_vertices_); std::fill(degrees.begin(), degrees.end(), 0); for (size_t i = 0; i < num_vertices_; i++) degrees[i] = vertices[i].size(); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - std::vector offsets(degrees.size() + 1); IndexT total = 0; for (size_t n = 0; n < degrees.size(); n++) { @@ -171,26 +84,15 @@ class LGraph { total += degrees[n]; } offsets[degrees.size()] = total; - + degrees.clear(); assert(num_edges_ == offsets[num_vertices_]); - weight_ = new ValueT[num_edges_]; colidx_ = new IndexT[num_edges_]; rowptr_ = new IndexT[num_vertices_ + 1]; for (size_t i = 0; i < num_vertices_ + 1; i++) rowptr_[i] = offsets[i]; for (size_t i = 0; i < num_vertices_; i++) { - for (auto it = vertices[i].begin(); it < vertices[i].end(); it++) { - Edge e = *it; - assert(i == e.src); - if (symmetrize_ || (!symmetrize_ && !transpose)) { - weight_[offsets[e.src]] = e.elabel; - colidx_[offsets[e.src]++] = e.dst; - } - if (symmetrize_ || (!symmetrize_ && transpose)) { - weight_[offsets[e.dst]] = e.elabel; - colidx_[offsets[e.dst]++] = e.src; - } - } + for (auto dst : vertices[i]) + colidx_[offsets[i]++] = dst; } } }; From 7fdbfe639bdf256abdb2287d984f859c992644e6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 14:15:41 -0600 Subject: [PATCH 064/660] fix lgraph --- libdeepgalois/include/lgraph.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index f3426db2a2..2eb5ec6863 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -44,7 +44,7 @@ class LGraph { num_vertices_ = m; num_edges_ = 0; std::cout << "num_vertices " << num_vertices_ << "\n"; - vertices.resize(m); + std::vector > vertices(m); for (size_t i = 0; i < n; i++) { std::set neighbors; if (add_self_loop) neighbors.insert(i); @@ -71,7 +71,7 @@ class LGraph { IndexT* rowptr_; IndexT* colidx_; - void MakeCSR(std::vector > vertices, bool transpose) { + void MakeCSR(std::vector > vertices) { std::vector degrees; degrees.resize(num_vertices_); std::fill(degrees.begin(), degrees.end(), 0); From f07d8c22ef080d7d67a38dff8f58eac180300acc Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 14:41:23 -0600 Subject: [PATCH 065/660] add selfloop --- libdeepgalois/include/context.h | 1 + libdeepgalois/src/context.cpp | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 7444e90251..61a8eed69d 100644 --- a/libdeepgalois/include/context.h +++ 
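// Hedged usage sketch of the slimmed-down LGraph above (assumption:
// "toy.el" is a hypothetical edge-list file whose header line carries the
// two counts read by read_edgelist, followed by one "src dst" pair per line).
#include <cstdio>
#include "lgraph.h" // moved to deepgalois/lgraph.h by a later patch

void lgraph_example() {
  LGraph g;
  g.read_edgelist("toy.el", /*symmetrize=*/true, /*add_self_loop=*/false);
  for (size_t v = 0; v < g.num_vertices(); ++v) {
    printf("vertex %zu has %u neighbors\n", v, g.out_degree(v));
    for (IndexT e = g.get_offset(v); e < g.get_offset(v + 1); ++e) {
      IndexT dst = g.get_dest(e); // consume edge (v, dst)
      (void)dst;
    }
  }
  g.clean();
}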
b/libdeepgalois/include/context.h @@ -65,6 +65,7 @@ class Context { #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); + void add_selfloop(Graph og, Graph &g); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 237416342c..6f7169add4 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -18,20 +18,23 @@ size_t Context::read_graph(std::string dataset_str) { } #ifdef CPU_ONLY -size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { +size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop = false) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); - LGraph lgraph; if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); + LGraph lgraph; lgraph.read_edgelist(filename.c_str(), true); // symmetrize genGraph(lgraph, graph_cpu); lgraph.clean(); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph_cpu, filename); + if (selfloop) { + galois::graphs::readGraph(graph_temp, filename); + add_selfloop(graph_temp, graph_cpu); + } else galois::graphs::readGraph(graph_cpu, filename); } else { printf("Unkown file format\n"); exit(1); @@ -55,6 +58,24 @@ void Context::genGraph(LGraph& lg, Graph& g) { } } +void Context::add_selfloop(Graph og, Graph &g) { + g.allocateFrom(og.size(), og.size()+og.sizeEdges()); + g.constructNodes(); + for (size_t src = 0; src < og.size(); src++) { + g.getData(src) = 1; + auto row_end = og.edge_end(src); + g.fixEndEdge(src, row_end+src+1); + bool self_inserted = false; + for (auto e : og.edges(src)) { + auto dst = og.edgeDst(e); + if (!self_inserted && dst > src) { + g.constructEdge(e, src, 0); + self_inserted = true; + } + g.constructEdge(e, dst, 0); + } +} + float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif From b040b0d171872c26747961509e49438c329377cf Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 15:41:41 -0600 Subject: [PATCH 066/660] comments/licensing things --- .../include/layers/graph_conv_layer.h | 31 +++++++++++-------- libdeepgalois/licensenote.txt | 2 ++ libdeepgalois/src/layers/graph_conv_layer.cpp | 11 +++++-- libdeepgalois/src/math_functions.cpp | 1 + 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 86a91c7287..e4296a44ff 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -2,19 +2,24 @@ #include "layer.h" #include "aggregator.h" -/* GraphConv Layer - Parameters - ---------- - x: int, number of samples. - y: int, Input feature size. - z: int, Output feature size. - dropout: bool, optional, if True, a dropout operation is applied before - other operations. norm : bool, optional, if True, the normalizer - :math:`c_{ij}` is applied. Default: ``True``. bias : bool, optional, if True, - adds a learnable bias to the output. Default: ``False``. activation: callable - activation function/layer or None, optional If not None, applies an - activation function to the updated node features. Default: ``None``. 
-*/ + +/** + * GraphConv Layer; based on DGL implementation + * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html + * + * Parameters + * ---------- + * x: int, number of samples. + * y: int, Input feature size. + * z: int, Output feature size. + * dropout: bool, optional, if True, a dropout operation is applied before + * other operations. + * norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. + * Default: ``True``. + * bias : bool, optional, if True, adds a learnable bias to the output. + * Default: ``False``. + * activation: default false + */ class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt index c1e14addca..224adbc701 100644 --- a/libdeepgalois/licensenote.txt +++ b/libdeepgalois/licensenote.txt @@ -6,3 +6,5 @@ note here e.g. https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn under BSD-3 + +DGL structure as well from what I can tell diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 753deed714..c7ae0b944d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -73,9 +73,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else + } else { matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + } + aggregate(z, context->graph_cpu, out_temp, out_data); + if (act_) { galois::do_all( galois::iterate((size_t)0, x), @@ -95,7 +98,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, out_temp[i * z + j] = out_data[i * z + j] > float_t(0) ? 
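// Hedged serial reference (assumption: illustrative only, dropout omitted)
// for the forward pass wired up below: out = ReLU( A_hat * (in * W) ),
// where A_hat is the normalized adjacency applied by aggregate()/update_all
// and norm[] holds the per-edge factors computed earlier.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

void gcn_forward_ref(std::size_t x, std::size_t y, std::size_t z,
                     const std::vector<float>& in,        // x*y features
                     const std::vector<float>& W,         // y*z weights
                     const std::vector<uint32_t>& rowptr, // x+1 offsets
                     const std::vector<uint32_t>& colidx, // |E| destinations
                     const std::vector<float>& norm,      // |E| edge factors
                     std::vector<float>& out) {           // x*z output
  std::vector<float> tmp(x * z, 0.0f); // tmp = in * W   (x*y times y*z)
  for (std::size_t i = 0; i < x; ++i)
    for (std::size_t k = 0; k < y; ++k)
      for (std::size_t j = 0; j < z; ++j)
        tmp[i * z + j] += in[i * y + k] * W[k * z + j];
  out.assign(x * z, 0.0f);             // out = A_hat * tmp (aggregate)
  for (std::size_t i = 0; i < x; ++i)
    for (uint32_t e = rowptr[i]; e < rowptr[i + 1]; ++e)
      for (std::size_t j = 0; j < z; ++j)
        out[i * z + j] += norm[e] * tmp[colidx[e] * z + j];
  for (float& v : out)                 // act_ == true -> ReLU
    v = std::max(v, 0.0f);
}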
out_grad[i * z + j] : float_t(0); }, galois::loopname("d_relu")); - } else copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + } else { + copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + } + if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix @@ -113,6 +119,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, galois::loopname("d_dropout")); } } + // calculate weight gradients transpose(x, y, in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 9914fd68d5..5c6c8b7ec3 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -185,6 +185,7 @@ void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { } } +// num rows in A, C; num columns in B, C; num columns in A, rows in B void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { galois::StatTimer Tmatmul("MatMul"); From 87a35505889bf9b5379cec8b9c093fda6f84af20 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 15:44:15 -0600 Subject: [PATCH 067/660] add selfloop --- libdeepgalois/include/context.h | 6 +++--- libdeepgalois/include/net.h | 2 +- libdeepgalois/src/context.cpp | 31 +++++++++++++++++++------------ libdeepgalois/src/context.cu | 3 ++- libdeepgalois/src/net.cpp | 4 ++-- libgpu/include/graph_gpu.h | 30 ++++++++++++++++++++++++++---- lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/lonestargnn.h | 1 + 8 files changed, 55 insertions(+), 24 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 61a8eed69d..bfc3a90c25 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -35,15 +35,15 @@ class Context { bool multiprocess() { return multiprocess_; } void set_multiprocess(bool val) { multiprocess_ = val; } bool root_solver() { return solver_rank_ == 0; } - size_t read_graph(std::string dataset_str); + size_t read_graph(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str); label_t get_label(size_t i) { return labels[i]; } label_t* get_labels_ptr(size_t i) { return &(labels[0]); } float_t* get_in_ptr(); - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); - size_t read_graph_gpu(std::string dataset_str); + size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); + size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 743ac5ea11..79364105e9 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -18,7 +18,7 @@ class Net { public: Net() {} - void init(std::string dataset_str, unsigned epochs, unsigned hidden1); + void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 6f7169add4..eefa2da886 100644 --- 
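// Hedged shape summary inferred from the comment and call sites above: for
// matmul1D1D(dim_x, dim_y, dim_z, A, B, C), A is dim_x x dim_z, B is
// dim_z x dim_y, and C = A * B is dim_x x dim_y, all flat row-major arrays.
// e.g. multiplying x*y node features by a y*z weight matrix into x*z output:
//   matmul1D1D(/*dim_x=*/x, /*dim_y=*/z, /*dim_z=*/y, in_data, W, out_temp);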
a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -8,17 +8,17 @@ Context::Context() Context::~Context() {} #endif -size_t Context::read_graph(std::string dataset_str) { +size_t Context::read_graph(std::string dataset_str, bool selfloop) { #ifdef CPU_ONLY - n = read_graph_cpu(dataset_str, "gr"); + n = read_graph_cpu(dataset_str, "gr", selfloop); #else - n = read_graph_gpu(dataset_str); + n = read_graph_gpu(dataset_str, selfloop); #endif return n; } #ifdef CPU_ONLY -size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop = false) { +size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); if (filetype == "el") { @@ -63,16 +63,23 @@ void Context::add_selfloop(Graph og, Graph &g) { g.constructNodes(); for (size_t src = 0; src < og.size(); src++) { g.getData(src) = 1; - auto row_end = og.edge_end(src); - g.fixEndEdge(src, row_end+src+1); + auto begin = og.edge_begin(src); + auto end = og.edge_end(src); + g.fixEndEdge(src, end+src+1); bool self_inserted = false; - for (auto e : og.edges(src)) { + for (auto e = begin; e != end; e++) { auto dst = og.edgeDst(e); - if (!self_inserted && dst > src) { - g.constructEdge(e, src, 0); - self_inserted = true; - } - g.constructEdge(e, dst, 0); + if (!self_inserted) { + if (dst > src) { + g.constructEdge(e+src, src, 0); + g.constructEdge(e+src+1, dst, 0); + self_inserted = true; + else if (e+1 == end) { + g.constructEdge(e+src+1, src, 0); + g.constructEdge(e+src, dst, 0); + self_inserted = true; + } else g.constructEdge(e+src, dst, 0); + } else g.constructEdge(e+src+1, dst, 0); } } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 1ba6bcc8bd..23c27f370f 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -116,10 +116,11 @@ void Context::SetDevice(const int device_id) { curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } -size_t Context::read_graph_gpu(std::string dataset_str) { +size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { std::string filename = path + dataset_str + ".csgr"; CSRGraph g; g.read(filename.c_str(), false); + if (selfloop) g.add_selfloop(); g.copy_to_gpu(graph_gpu); return graph_gpu.nnodes; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6e253a2afd..6bfe6f0f30 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,9 +1,9 @@ #include "net.h" -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { context = new Context(); // read graph, get num nodes - num_samples = context->read_graph(dataset_str); + num_samples = context->read_graph(dataset_str, selfloop); num_classes = context->read_labels(dataset_str); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 4784e510a5..be337eb9ac 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -104,10 +104,32 @@ struct CSRGraph { check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); #endif - //int *h_degrees = (int *)malloc(m * sizeof(int)); - //for (int i = 0; i < m; i++) h_degrees[i] = h_row_offsets[i + 1] - h_row_offsets[i]; - 
//check_cuda(cudaMalloc((void **)&d_degrees, m * sizeof(int))); - //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); + } + + void add_selfloop() { + index_type *new_edge_dst = new index_type[nnodes+nedges]; + for (index_type i = 0; i < nnodes; i++) { + index_type start = row_start[i]; + index_type end = row_start[i+1]; + bool selfloop_inserted = false; + for (index_type e = start; e != end; e++) { + index_type dst = edge_dst[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + new_edge_dst[e+i] = i; + new_edge_dst[e+i+1] = dst; + } else if (e+1 == end) { + selfloop_inserted = true; + new_edge_dst[e+i+1] = i; + new_edge_dst[e+i] = dst; + } else new_edge_dst[e+i] = dst; + } else new_edge_dst[e+i+1] = dst; + } + } + for (index_type i = 0; i < nnodes; i++) row_start[i] += i; + delete edge_dst; + edge_dst = new_edge_dst; } __device__ __host__ index_type getEdgeDst(unsigned edge) { diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 55c4e2320f..0f419896c8 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -11,7 +11,7 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); Net network; // the neural network to train // read network, features, ground truth, initialize metadata - network.init(dataset, epochs, hidden1); + network.init(dataset, epochs, hidden1, add_selfloop); network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index 7ecbe32d7a..e41fb39ab4 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -48,6 +48,7 @@ static cll::opt max_degree( static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); //! 
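// Hedged host-side sketch (assumption: not the in-tree code) of what
// CSRGraph::add_selfloop above does: give every vertex i one extra edge
// (i, i), inserted so each sorted row stays sorted, which shifts row i's
// offset by i. Assumes rows are sorted and contain no self loops yet.
#include <cstdint>
#include <vector>

void add_selfloops_csr(std::vector<uint32_t>& rowptr,    // n+1 offsets
                       std::vector<uint32_t>& colidx) {  // |E| destinations
  const uint32_t n = (uint32_t)rowptr.size() - 1;
  std::vector<uint32_t> new_row(n + 1, 0);
  std::vector<uint32_t> new_col;
  new_col.reserve(colidx.size() + n);
  for (uint32_t i = 0; i < n; ++i) {
    new_row[i] = (uint32_t)new_col.size();
    bool inserted = false;
    for (uint32_t e = rowptr[i]; e < rowptr[i + 1]; ++e) {
      if (!inserted && colidx[e] > i) { // first neighbor past i: insert i here
        new_col.push_back(i);
        inserted = true;
      }
      new_col.push_back(colidx[e]);
    }
    if (!inserted) new_col.push_back(i); // row empty or all neighbors < i
  }
  new_row[n] = (uint32_t)new_col.size();
  rowptr.swap(new_row);
  colidx.swap(new_col);
}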
standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 987557635c871e588320bdae1ffc55aed1662102 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 16:21:50 -0600 Subject: [PATCH 068/660] comment add_selfloop --- libdeepgalois/include/context.h | 2 +- libdeepgalois/src/context.cpp | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index bfc3a90c25..4715c2c248 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -65,7 +65,7 @@ class Context { #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); - void add_selfloop(Graph og, Graph &g); + void add_selfloop(Graph &og, Graph &g); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index eefa2da886..e30b5f0f37 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -32,6 +32,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { + Graph graph_temp; galois::graphs::readGraph(graph_temp, filename); add_selfloop(graph_temp, graph_cpu); } else galois::graphs::readGraph(graph_cpu, filename); @@ -58,9 +59,10 @@ void Context::genGraph(LGraph& lg, Graph& g) { } } -void Context::add_selfloop(Graph og, Graph &g) { +void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); g.constructNodes(); + /* for (size_t src = 0; src < og.size(); src++) { g.getData(src) = 1; auto begin = og.edge_begin(src); @@ -68,19 +70,21 @@ void Context::add_selfloop(Graph og, Graph &g) { g.fixEndEdge(src, end+src+1); bool self_inserted = false; for (auto e = begin; e != end; e++) { - auto dst = og.edgeDst(e); + auto dst = og.getEdgeDst(e); if (!self_inserted) { if (dst > src) { g.constructEdge(e+src, src, 0); g.constructEdge(e+src+1, dst, 0); self_inserted = true; - else if (e+1 == end) { + } else if (e+1 == end) { g.constructEdge(e+src+1, src, 0); g.constructEdge(e+src, dst, 0); self_inserted = true; } else g.constructEdge(e+src, dst, 0); } else g.constructEdge(e+src+1, dst, 0); + } } + */ } float_t* Context::get_in_ptr() { return &h_feats[0]; } From 655feaa6fefd596bb9a2219c92acd77e25bcabc2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 17:43:06 -0600 Subject: [PATCH 069/660] fix selfloop --- libgpu/include/graph_gpu.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index be337eb9ac..da420ea416 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -105,8 +105,19 @@ struct CSRGraph { check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); #endif } - + void print_neighbors(index_type vid) { + printf("Vertex %d neighbors: [ ", vid); + index_type start = row_start[vid]; + index_type end = row_start[vid+1]; + for (index_type e = start; e != end; e++) { + index_type dst = edge_dst[e]; + printf("%d ", dst); + } + printf("]\n"); + } void add_selfloop() { + print_neighbors(nnodes-1); + print_neighbors(0); index_type *new_edge_dst = new index_type[nnodes+nedges]; for (index_type i = 0; i < nnodes; i++) { index_type start = row_start[i]; @@ -127,9 
+138,13 @@ struct CSRGraph { } else new_edge_dst[e+i+1] = dst; } } - for (index_type i = 0; i < nnodes; i++) row_start[i] += i; + for (index_type i = 0; i <= nnodes; i++) row_start[i] += i; delete edge_dst; edge_dst = new_edge_dst; + nedges += nnodes; + printf("nnodes = %d, nedges = %d\n", nnodes, nedges); + print_neighbors(nnodes-1); + print_neighbors(0); } __device__ __host__ index_type getEdgeDst(unsigned edge) { From 8cca98240cc87ed9d094b0e2558662a60adb91eb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:17:22 -0600 Subject: [PATCH 070/660] random.h is from Caffe; add TODO --- libdeepgalois/include/random.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdeepgalois/include/random.h b/libdeepgalois/include/random.h index 8560a24de1..b63914bca1 100644 --- a/libdeepgalois/include/random.h +++ b/libdeepgalois/include/random.h @@ -1,3 +1,6 @@ +// From Caffe library it seems +// TODO get the license from it + #ifndef RANDOM_H #define RANDOM_H typedef boost::mt19937 rng_t; From 4002f29138d082b84e43a81923200b7fb6e267ae Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:22:54 -0600 Subject: [PATCH 071/660] moving some unused(?) files --- libdeepgalois/include/{ => unused}/random.h | 0 libdeepgalois/include/{ => unused}/timer.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename libdeepgalois/include/{ => unused}/random.h (100%) rename libdeepgalois/include/{ => unused}/timer.h (100%) diff --git a/libdeepgalois/include/random.h b/libdeepgalois/include/unused/random.h similarity index 100% rename from libdeepgalois/include/random.h rename to libdeepgalois/include/unused/random.h diff --git a/libdeepgalois/include/timer.h b/libdeepgalois/include/unused/timer.h similarity index 100% rename from libdeepgalois/include/timer.h rename to libdeepgalois/include/unused/timer.h From c49db0ab0ff1f9b5dd39bf1dae344227d1513294 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:27:09 -0600 Subject: [PATCH 072/660] initialize deepgalois directory: context.h moved --- libdeepgalois/include/{ => deepgalois}/context.h | 0 libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/net.h | 2 +- libdeepgalois/src/context.cpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename libdeepgalois/include/{ => deepgalois}/context.h (100%) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/deepgalois/context.h similarity index 100% rename from libdeepgalois/include/context.h rename to libdeepgalois/include/deepgalois/context.h diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 355f75a440..9d6f01f644 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -26,7 +26,7 @@ #include "../types.h" #include "../utils.h" #include "../gtypes.h" -#include "../context.h" +#include "deepgalois/context.h" #include "../optimizer.h" #include "../math_functions.hh" /** diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 79364105e9..1530a9c4dd 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -4,7 +4,7 @@ #include #include "types.h" #include "gtypes.h" -#include "context.h" +#include "deepgalois/context.h" #include "galois/Timer.h" #include "layers.h" #include "optimizer.h" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index e30b5f0f37..b257f8422c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,4 +1,4 @@ -#include "context.h" +#include 
"deepgalois/context.h" #include "gtypes.h" #ifdef CPU_ONLY From f54717a6ea05ce285ed1dc970f3f0d0217e15f1d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:35:14 -0600 Subject: [PATCH 073/660] gtypes, types, utils to deepgalois directory --- libdeepgalois/include/aggregator.h | 4 ++-- libdeepgalois/include/{ => deepgalois}/gtypes.h | 0 libdeepgalois/include/{ => deepgalois}/types.h | 0 libdeepgalois/include/{ => deepgalois}/utils.h | 0 libdeepgalois/include/layers/layer.h | 8 ++++---- libdeepgalois/include/math_functions.hh | 3 ++- libdeepgalois/include/net.h | 4 ++-- libdeepgalois/include/node.h | 2 +- libdeepgalois/include/optimizer.h | 2 +- libdeepgalois/src/aggregator.cpp | 2 -- libdeepgalois/src/context.cpp | 1 - libdeepgalois/src/math_functions.cpp | 1 - lonestargnn/lonestargnn.h | 4 ++-- 13 files changed, 14 insertions(+), 17 deletions(-) rename libdeepgalois/include/{ => deepgalois}/gtypes.h (100%) rename libdeepgalois/include/{ => deepgalois}/types.h (100%) rename libdeepgalois/include/{ => deepgalois}/utils.h (100%) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 6853ea7126..c54f8f69bc 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -1,7 +1,7 @@ #pragma once -#include "types.h" +#include "deepgalois/types.h" #ifdef CPU_ONLY -#include "gtypes.h" +#include "deepgalois/gtypes.h" void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); #else diff --git a/libdeepgalois/include/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h similarity index 100% rename from libdeepgalois/include/gtypes.h rename to libdeepgalois/include/deepgalois/gtypes.h diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/deepgalois/types.h similarity index 100% rename from libdeepgalois/include/types.h rename to libdeepgalois/include/deepgalois/types.h diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/deepgalois/utils.h similarity index 100% rename from libdeepgalois/include/utils.h rename to libdeepgalois/include/deepgalois/utils.h diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 9d6f01f644..b393098680 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -1,6 +1,6 @@ #pragma once /** - * Code based on below link. + * Code from on below link. Modified under Galois. 
* * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h * @@ -23,9 +23,9 @@ #include #include #include "../node.h" -#include "../types.h" -#include "../utils.h" -#include "../gtypes.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" +#include "deepgalois/gtypes.h" #include "deepgalois/context.h" #include "../optimizer.h" #include "../math_functions.hh" diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index f89f34a5a5..2d3adc5404 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -4,7 +4,8 @@ #include #include #include -#include "types.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" extern "C" { #include diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 1530a9c4dd..ba60ddf771 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,8 +2,8 @@ #define _MODEL_H_ #include -#include "types.h" -#include "gtypes.h" +#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" #include "deepgalois/context.h" #include "galois/Timer.h" #include "layers.h" diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 947e997275..fa58ddea2b 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -12,7 +12,7 @@ #include #include #include -#include "types.h" +#include "deepgalois/types.h" class node; class layer; class edge; diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 28cbabc5f5..cd6b36447c 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -10,7 +10,7 @@ #include #include -#include "types.h" +#include "deepgalois/types.h" #ifndef CPU_ONLY #include "math_functions.hh" #endif diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 6bb301b0be..370d3a6514 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,5 +1,3 @@ -#include "types.h" -#include "gtypes.h" #include "aggregator.h" #include "math_functions.hh" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index b257f8422c..284f693829 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,5 +1,4 @@ #include "deepgalois/context.h" -#include "gtypes.h" #ifdef CPU_ONLY Context::Context() diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 5c6c8b7ec3..451fe59070 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,5 +1,4 @@ #include "math_functions.hh" -#include "utils.h" #include "galois/Timer.h" #include diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index e41fb39ab4..baf7681995 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -104,6 +104,6 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("(NULL)", "Hostname", name); } -#include "types.h" -#include "utils.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" #include "net.h" From 07f02c25c3c6ba78b00a5154a0aa625d9fde6fbd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:37:42 -0600 Subject: [PATCH 074/660] added tinydnn note for math_functions.hh --- libdeepgalois/include/math_functions.hh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 
2d3adc5404..3e1af207da 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -1,3 +1,7 @@ +/** + * File inspired by similar one from TinyDNN + * https://github.com/tiny-dnn/ + */ #ifndef _MATH_FUNCTIONS_ #define _MATH_FUNCTIONS_ #include From 4c2fabf62492910e9e18174ac5715177f03244e0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:39:30 -0600 Subject: [PATCH 075/660] aggregator.h to deepgalois --- libdeepgalois/include/{ => deepgalois}/aggregator.h | 0 libdeepgalois/include/layers/graph_conv_layer.h | 2 +- libdeepgalois/src/aggregator.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename libdeepgalois/include/{ => deepgalois}/aggregator.h (100%) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/deepgalois/aggregator.h similarity index 100% rename from libdeepgalois/include/aggregator.h rename to libdeepgalois/include/deepgalois/aggregator.h diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index e4296a44ff..4016b49024 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -1,6 +1,6 @@ #pragma once #include "layer.h" -#include "aggregator.h" +#include "deepgalois/aggregator.h" /** diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 370d3a6514..b9d1a70c7a 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,4 +1,4 @@ -#include "aggregator.h" +#include "deepgalois/aggregator.h" #include "math_functions.hh" void update_all(size_t len, Graph& g, const float_t* in, float_t* out, From 56878dcef44e2ee5fb81e12f9fb98d47940e680f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:44:07 -0600 Subject: [PATCH 076/660] remove layers.h, move layers directory into deepgalois --- .../include/{ => deepgalois}/layers/arithmetic_layer.h | 0 .../include/{ => deepgalois}/layers/graph_conv_layer.h | 0 libdeepgalois/include/{ => deepgalois}/layers/layer.h | 6 +++--- .../include/{ => deepgalois}/layers/linear_layer.h | 0 .../include/{ => deepgalois}/layers/relu_layer.h | 0 .../include/{ => deepgalois}/layers/softmax_loss_layer.h | 0 libdeepgalois/include/layers.h | 8 -------- libdeepgalois/include/net.h | 5 +++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/layers/relu_layer.cpp | 2 +- libdeepgalois/src/layers/softmax_loss_layer.cpp | 2 +- 11 files changed, 9 insertions(+), 16 deletions(-) rename libdeepgalois/include/{ => deepgalois}/layers/arithmetic_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/graph_conv_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/layer.h (98%) rename libdeepgalois/include/{ => deepgalois}/layers/linear_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/relu_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/softmax_loss_layer.h (100%) delete mode 100644 libdeepgalois/include/layers.h diff --git a/libdeepgalois/include/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h similarity index 100% rename from libdeepgalois/include/layers/arithmetic_layer.h rename to libdeepgalois/include/deepgalois/layers/arithmetic_layer.h diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h similarity index 100% rename from libdeepgalois/include/layers/graph_conv_layer.h rename to 
libdeepgalois/include/deepgalois/layers/graph_conv_layer.h diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h similarity index 98% rename from libdeepgalois/include/layers/layer.h rename to libdeepgalois/include/deepgalois/layers/layer.h index b393098680..7c40bc256c 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -22,13 +22,13 @@ #include #include #include -#include "../node.h" +#include "../../node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" #include "deepgalois/context.h" -#include "../optimizer.h" -#include "../math_functions.hh" +#include "../../optimizer.h" +#include "../../math_functions.hh" /** * base class of all kind of NN layers * diff --git a/libdeepgalois/include/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h similarity index 100% rename from libdeepgalois/include/layers/linear_layer.h rename to libdeepgalois/include/deepgalois/layers/linear_layer.h diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h similarity index 100% rename from libdeepgalois/include/layers/relu_layer.h rename to libdeepgalois/include/deepgalois/layers/relu_layer.h diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h similarity index 100% rename from libdeepgalois/include/layers/softmax_loss_layer.h rename to libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h diff --git a/libdeepgalois/include/layers.h b/libdeepgalois/include/layers.h deleted file mode 100644 index 432d315183..0000000000 --- a/libdeepgalois/include/layers.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _LAYERS_H_ -#define _LAYERS_H_ -//#include "layers/relu_layer.h" -//#include "layers/linear_layer.h" -//#include "layers/arithmetic_layer.h" -#include "layers/graph_conv_layer.h" -#include "layers/softmax_loss_layer.h" -#endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index ba60ddf771..b1d514050e 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,11 +2,12 @@ #define _MODEL_H_ #include +#include "galois/Timer.h" #include "deepgalois/types.h" #include "deepgalois/gtypes.h" #include "deepgalois/context.h" -#include "galois/Timer.h" -#include "layers.h" +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/layers/softmax_loss_layer.h" #include "optimizer.h" #define NUM_CONV_LAYERS 2 diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c7ae0b944d..adcd7cc33c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,4 +1,4 @@ -#include "layers/graph_conv_layer.h" +#include "deepgalois/layers/graph_conv_layer.h" #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 0c52d0eb25..ce2a167cb0 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,4 +1,4 @@ -#include "layers/relu_layer.h" +#include "deepgalois/layers/relu_layer.h" // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const tensor_t& in_data, diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 
af04b06bbf..cc3e3b941b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,4 +1,4 @@ -#include "layers/softmax_loss_layer.h" +#include "deepgalois/layers/softmax_loss_layer.h" softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, From d47a23ac2280b4b8fe8366bee72519914c034970 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:49:32 -0600 Subject: [PATCH 077/660] lgraph, cutils, math, net, node, optimizer to deepgalois --- libdeepgalois/include/deepgalois/context.h | 10 +++++----- libdeepgalois/include/{ => deepgalois}/cutils.h | 0 libdeepgalois/include/deepgalois/layers/layer.h | 6 +++--- libdeepgalois/include/{ => deepgalois}/lgraph.h | 0 .../include/{ => deepgalois}/math_functions.hh | 0 libdeepgalois/include/{ => deepgalois}/net.h | 2 +- libdeepgalois/include/{ => deepgalois}/node.h | 0 libdeepgalois/include/{ => deepgalois}/optimizer.h | 2 +- libdeepgalois/src/aggregator.cpp | 2 +- libdeepgalois/src/math_functions.cpp | 2 +- libdeepgalois/src/net.cpp | 2 +- libdeepgalois/src/node.cpp | 2 +- libdeepgalois/src/optimizer.cpp | 2 +- lonestargnn/lonestargnn.h | 2 +- 14 files changed, 16 insertions(+), 16 deletions(-) rename libdeepgalois/include/{ => deepgalois}/cutils.h (100%) rename libdeepgalois/include/{ => deepgalois}/lgraph.h (100%) rename libdeepgalois/include/{ => deepgalois}/math_functions.hh (100%) rename libdeepgalois/include/{ => deepgalois}/net.h (99%) rename libdeepgalois/include/{ => deepgalois}/node.h (100%) rename libdeepgalois/include/{ => deepgalois}/optimizer.h (99%) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 4715c2c248..644f3f0c15 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -11,14 +11,14 @@ #include #include -#include "types.h" -#include "utils.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" #ifdef CPU_ONLY -#include "lgraph.h" -#include "gtypes.h" +#include "deepgalois/lgraph.h" +#include "deepgalois/gtypes.h" #else #include "graph_gpu.h" -#include "cutils.h" +#include "deepgalois/cutils.h" #endif class Context { diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/deepgalois/cutils.h similarity index 100% rename from libdeepgalois/include/cutils.h rename to libdeepgalois/include/deepgalois/cutils.h diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7c40bc256c..b5757de2e3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -22,13 +22,13 @@ #include #include #include -#include "../../node.h" +#include "deepgalois/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" #include "deepgalois/context.h" -#include "../../optimizer.h" -#include "../../math_functions.hh" +#include "deepgalois/optimizer.h" +#include "deepgalois/math_functions.hh" /** * base class of all kind of NN layers * diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h similarity index 100% rename from libdeepgalois/include/lgraph.h rename to libdeepgalois/include/deepgalois/lgraph.h diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh similarity index 100% rename from libdeepgalois/include/math_functions.hh rename to libdeepgalois/include/deepgalois/math_functions.hh diff --git 
a/libdeepgalois/include/net.h b/libdeepgalois/include/deepgalois/net.h similarity index 99% rename from libdeepgalois/include/net.h rename to libdeepgalois/include/deepgalois/net.h index b1d514050e..79176674c2 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -8,7 +8,7 @@ #include "deepgalois/context.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" -#include "optimizer.h" +#include "deepgalois/optimizer.h" #define NUM_CONV_LAYERS 2 diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/deepgalois/node.h similarity index 100% rename from libdeepgalois/include/node.h rename to libdeepgalois/include/deepgalois/node.h diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h similarity index 99% rename from libdeepgalois/include/optimizer.h rename to libdeepgalois/include/deepgalois/optimizer.h index cd6b36447c..2c2d783d0d 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -12,7 +12,7 @@ #include #include "deepgalois/types.h" #ifndef CPU_ONLY -#include "math_functions.hh" +#include "deepgalois/math_functions.hh" #endif // base class of optimizer diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index b9d1a70c7a..c2a50710dd 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,5 +1,5 @@ #include "deepgalois/aggregator.h" -#include "math_functions.hh" +#include "deepgalois/math_functions.hh" void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 451fe59070..144419f16d 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,4 +1,4 @@ -#include "math_functions.hh" +#include "deepgalois/math_functions.hh" #include "galois/Timer.h" #include diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6bfe6f0f30..33da83d0fc 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,4 +1,4 @@ -#include "net.h" +#include "deepgalois/net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { context = new Context(); diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index 9b88620d65..4ab918e0cd 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,4 +1,4 @@ -#include "node.h" +#include "deepgalois/node.h" #include void edge::alloc() { diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index 0ec40cf4d0..c9c8768610 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,4 +1,4 @@ -#include "optimizer.h" +#include "deepgalois/optimizer.h" #include "galois/Galois.h" void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index baf7681995..a04905b5cb 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -106,4 +106,4 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, #include "deepgalois/types.h" #include "deepgalois/utils.h" -#include "net.h" +#include "deepgalois/net.h" From 7d69511228db07d588a1dfa2bef3e159bee38a12 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:55:06 -0600 Subject: [PATCH 078/660] Context class now deepgalois::Context --- 
libdeepgalois/include/deepgalois/context.h | 2 ++ libdeepgalois/include/deepgalois/layers/layer.h | 4 ++-- libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/src/context.cpp | 5 ++++- libdeepgalois/src/context.cu | 4 +++- libdeepgalois/src/net.cpp | 2 +- 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 644f3f0c15..d7f400d582 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -21,6 +21,7 @@ #include "deepgalois/cutils.h" #endif +namespace deepgalois { class Context { public: Context(); @@ -88,3 +89,4 @@ class Context { int solver_rank_; bool multiprocess_; }; +} // end deepgalois namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b5757de2e3..bf89ad216d 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -55,7 +55,7 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} //! save context - virtual void set_context(Context* ctx) { context = ctx; } + virtual void set_context(deepgalois::Context* ctx) { context = ctx; } virtual acc_t get_masked_loss() { return acc_t(0); } // main functions for layer work @@ -158,7 +158,7 @@ class layer : public node { mask_t* masks_; // masks to show which samples are valid mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 - Context* context; + deepgalois::Context* context; }; // head: layer i+1, tail: layer i diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 79176674c2..efdd99b7b3 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -101,7 +101,7 @@ class Net { } protected: - Context* context; + deepgalois::Context* context; size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 284f693829..5e2ccf4c02 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,5 +1,7 @@ #include "deepgalois/context.h" +namespace deepgalois { + #ifdef CPU_ONLY Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), @@ -179,4 +181,5 @@ inline void init_features(size_t dim, vec_t &x) { for (size_t i = 0; i < dim; ++i) x[i] = dist(rng); } -//*/ +*/ +} // end deepgalois namespace diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 23c27f370f..270252c5d8 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -54,6 +54,8 @@ __global__ void norm_factor_counting_edge(int n, CSRGraph graph, } } +namespace deepgalois { + void Context::norm_factor_counting_gpu() { assert(graph_gpu.nnodes == n); std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; @@ -137,4 +139,4 @@ void Context::copy_data_to_device() { } float_t* Context::get_in_ptr() { return d_feats; } - +} // namespace context diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 33da83d0fc..9c907dbf57 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,7 +1,7 @@ #include "deepgalois/net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { - context = new Context(); + context 
= new deepgalois::Context(); // read graph, get num nodes num_samples = context->read_graph(dataset_str, selfloop); num_classes = context->read_labels(dataset_str); From 7f28900ced040ad353a1533074b0a49a71f77755 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:58:29 -0600 Subject: [PATCH 079/660] Net class now deepgalois::Net --- libdeepgalois/include/deepgalois/net.h | 6 ++++++ libdeepgalois/src/net.cpp | 4 ++++ lonestargnn/gcn/gcn.cpp | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index efdd99b7b3..0e18f39e1c 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -1,3 +1,5 @@ +// TODO if this code was based on something, get copyright/license and put here + #ifndef _MODEL_H_ #define _MODEL_H_ @@ -12,6 +14,8 @@ #define NUM_CONV_LAYERS 2 +namespace deepgalois { + // N: number of vertices, D: feature vector dimentions, // E: number of distinct labels, i.e. number of vertex classes // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) @@ -114,4 +118,6 @@ class Net { acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); }; +} // namespace deepgalois + #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9c907dbf57..2221b3daad 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,5 +1,7 @@ #include "deepgalois/net.h" +namespace deepgalois { + void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { context = new deepgalois::Context(); // read graph, get num nodes @@ -123,3 +125,5 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, context->d_labels); #endif } + +} // namespace deepgalois diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 0f419896c8..2d47237298 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -9,7 +9,7 @@ const char* url = 0; int main(int argc, char** argv) { galois::SharedMemSys G; LonestarGnnStart(argc, argv, name, desc, url); - Net network; // the neural network to train + deepgalois::Net network; // the neural network to train // read network, features, ground truth, initialize metadata network.init(dataset, epochs, hidden1, add_selfloop); network.construct_layers(); // default setting for now; can be customized by From ce01e5b2702fe5fa73f850aec2bba80ff425cc38 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:04:20 -0600 Subject: [PATCH 080/660] optimizers now deepgalois::optimizer --- libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/include/deepgalois/optimizer.h | 11 ++++++++++- libdeepgalois/src/optimizer.cpp | 4 ++++ lonestargnn/gcn/gcn.cpp | 2 +- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index bf89ad216d..028148c194 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -122,7 +122,7 @@ class layer : public node { } //! 
use optimizer to update weights given gradient - void update_weight(optimizer* opt) { + void update_weight(deepgalois::optimizer* opt) { // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index 2c2d783d0d..b6a90917ff 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -1,5 +1,5 @@ /** - * Code modified from below link. + * Code taken/modified from below link. * * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h * Copyright (c) 2013, Taiga Nomi and the respective contributors @@ -8,6 +8,11 @@ */ #pragma once +// TODO: +// - use classes, not structs (modern C++) +// - templatize this instead of using inheritance +// - put optimizers in their own namespace + #include #include #include "deepgalois/types.h" @@ -15,6 +20,8 @@ #include "deepgalois/math_functions.hh" #endif +namespace deepgalois { + // base class of optimizer // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) @@ -184,3 +191,5 @@ struct nesterov_momentum : public stateful_optimizer<1> { float_t lambda; // weight decay float_t mu; // momentum }; + +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index c9c8768610..c3267f282e 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,6 +1,8 @@ #include "deepgalois/optimizer.h" #include "galois/Galois.h" +namespace deepgalois { + void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& g = get<0>(W); if (parallelize) { @@ -80,3 +82,5 @@ void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { dWprev[i] = V; }, galois::loopname("nesterov_momentum_update")); } + +} // namespace deepgalois diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 2d47237298..7b4977dbe1 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -20,7 +20,7 @@ int main(int argc, char** argv) { // the optimizer used to update parameters, see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); - optimizer* opt = new adam(); + deepgalois::optimizer* opt = new deepgalois::adam(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); network.train(opt, do_validate); // do training using training samples From f9f0d5ba5d3631852eb878a18c0fb3d4c6bc4364 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:16:06 -0600 Subject: [PATCH 081/660] deepgalois namespace for node and layers TODO qualify classes with deepgalois; right now relies on everything being in deepgalois namespace --- .../include/deepgalois/layers/arithmetic_layer.h | 2 ++ .../include/deepgalois/layers/graph_conv_layer.h | 3 ++- libdeepgalois/include/deepgalois/layers/layer.h | 12 +++++++++--- .../include/deepgalois/layers/linear_layer.h | 2 ++ libdeepgalois/include/deepgalois/layers/relu_layer.h | 2 ++ .../include/deepgalois/layers/softmax_loss_layer.h | 2 ++ libdeepgalois/include/deepgalois/node.h | 5 +++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 4 ++++ libdeepgalois/src/layers/relu_layer.cpp | 4 ++++ libdeepgalois/src/layers/softmax_loss_layer.cpp | 4 ++++ libdeepgalois/src/node.cpp | 4 ++++ 11 files changed, 40 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h index 
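// Hedged sketch of the textbook Adam step behind the deepgalois::adam
// optimizer selected in gcn.cpp above (assumption: illustrative only, not a
// copy of adam::update; hyper-parameter names and defaults are the usual ones).
#include <cmath>
#include <cstddef>
#include <vector>

void adam_step(std::vector<float>& W, const std::vector<float>& dW,
               std::vector<float>& m, std::vector<float>& v, int t,
               float alpha = 0.01f, float b1 = 0.9f, float b2 = 0.999f,
               float eps = 1e-8f) {
  const float bc1 = 1.0f - std::pow(b1, (float)t); // bias correction, t >= 1
  const float bc2 = 1.0f - std::pow(b2, (float)t);
  for (std::size_t i = 0; i < W.size(); ++i) {
    m[i] = b1 * m[i] + (1.0f - b1) * dW[i];         // 1st moment estimate
    v[i] = b2 * v[i] + (1.0f - b2) * dW[i] * dW[i]; // 2nd moment estimate
    W[i] -= alpha * (m[i] / bc1) / (std::sqrt(v[i] / bc2) + eps);
  }
}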
63dc66f780..c28d0ed89c 100644 --- a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h +++ b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { // element-wise add N vectors ```y_i = x0_i + x1_i + ... + xnum_i``` class elementwise_add_layer : public layer { public: @@ -24,3 +25,4 @@ class elementwise_add_layer : public layer { in_grad = out_grad; } }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 4016b49024..ed681bdf30 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -2,7 +2,6 @@ #include "layer.h" #include "deepgalois/aggregator.h" - /** * GraphConv Layer; based on DGL implementation * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html @@ -20,6 +19,7 @@ * Default: ``False``. * activation: default false */ +namespace deepgalois { class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, @@ -84,3 +84,4 @@ class graph_conv_layer : public layer { } } }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 028148c194..c0deaf6748 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -29,6 +29,9 @@ #include "deepgalois/context.h" #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" + +namespace deepgalois { + /** * base class of all kind of NN layers * @@ -42,7 +45,7 @@ * Node inheritance is just to get accessed to linked-list semantics it * provides **/ -class layer : public node { +class layer : public deepgalois::node { public: layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -92,14 +95,14 @@ class layer : public node { //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. 
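    // (note: only the input layer is fed through set_in_data(); for every other
    //  layer, connect() below aliases prev_ to the upstream layer's next_ edge,
    //  so the feature/gradient buffers allocated by add_edge()/alloc() are
    //  shared between adjacent layers rather than copied)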
} void add_edge() { // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); + next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); } @@ -161,6 +164,7 @@ class layer : public node { deepgalois::Context* context; }; + // head: layer i+1, tail: layer i inline void connect(layer* head, layer* tail, size_t head_index = 0, size_t tail_index = 0) { @@ -178,3 +182,5 @@ inline void connect(layer* head, layer* tail, size_t head_index = 0, tail->prev_ = head->next_; tail->prev_->add_next_node(tail); } + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h index 55d5d245d8..d68ae12479 100644 --- a/libdeepgalois/include/deepgalois/layers/linear_layer.h +++ b/libdeepgalois/include/deepgalois/layers/linear_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { class linear_layer : public layer { public: linear_layer(unsigned level, float_t scale, float_t bias, @@ -30,3 +31,4 @@ class linear_layer : public layer { protected: float_t scale_, bias_; }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h index 8a7b447038..a85d51608d 100644 --- a/libdeepgalois/include/deepgalois/layers/relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { // ReLU Layer class relu_layer : public layer { public: @@ -19,3 +20,4 @@ class relu_layer : public layer { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 0fa56cf7fe..798ad7a79a 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { class softmax_loss_layer : public layer { public: softmax_loss_layer(unsigned level, std::vector in_dims, @@ -14,3 +15,4 @@ class softmax_loss_layer : public layer { float_t* out_grad, float_t* in_grad); virtual acc_t get_masked_loss(); }; +} diff --git a/libdeepgalois/include/deepgalois/node.h b/libdeepgalois/include/deepgalois/node.h index fa58ddea2b..fcb20513c0 100644 --- a/libdeepgalois/include/deepgalois/node.h +++ b/libdeepgalois/include/deepgalois/node.h @@ -13,6 +13,9 @@ #include #include #include "deepgalois/types.h" + +namespace deepgalois { + class node; class layer; class edge; @@ -69,3 +72,5 @@ class edge { node* prev_; // previous node, "producer" of data node* next_; // next node, "consumer" of data }; + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index adcd7cc33c..b496f52d57 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,5 +1,7 @@ #include "deepgalois/layers/graph_conv_layer.h" +namespace deepgalois { + #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->norm_factor); @@ -157,3 +159,5 @@ void graph_conv_layer::back_propagation(const float_t* in_data, sgemm_gpu(CblasTrans, 
CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif + +} // namespace diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index ce2a167cb0..7441294f83 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,5 +1,7 @@ #include "deepgalois/layers/relu_layer.h" +namespace deepgalois { + // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const tensor_t& in_data, tensor_t& out_data) { @@ -43,3 +45,5 @@ void relu_layer::back_propagation(const float_t* in_data, const size_t count = input_dims[0] * input_dims[1]; d_relu_gpu(count, out_grad, in_data, in_grad); } + +} // namespace diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index cc3e3b941b..f16ba58fbe 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,5 +1,7 @@ #include "deepgalois/layers/softmax_loss_layer.h" +namespace deepgalois { + softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -85,3 +87,5 @@ acc_t softmax_loss_layer::get_masked_loss() { return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } #endif + +} // namespace diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index 4ab918e0cd..e3117d9da2 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,6 +1,8 @@ #include "deepgalois/node.h" #include +namespace deepgalois { + void edge::alloc() { // std::cout << "Allocating memory for tensors (intermediate features and // gradients) ...\n"; @@ -38,3 +40,5 @@ void edge::clear_grads() { clear_grads_gpu(); #endif } + +} // namespace deepgalois From 47d84e719a06a42a5c7af14be65e132a69f6f700 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:17:19 -0600 Subject: [PATCH 082/660] moved deepgalois node to layers (only used there) --- libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/include/deepgalois/{ => layers}/node.h | 0 libdeepgalois/src/node.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename libdeepgalois/include/deepgalois/{ => layers}/node.h (100%) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c0deaf6748..f30ad03b7b 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -22,7 +22,7 @@ #include #include #include -#include "deepgalois/node.h" +#include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" diff --git a/libdeepgalois/include/deepgalois/node.h b/libdeepgalois/include/deepgalois/layers/node.h similarity index 100% rename from libdeepgalois/include/deepgalois/node.h rename to libdeepgalois/include/deepgalois/layers/node.h diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index e3117d9da2..b1ee96a58b 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,4 +1,4 @@ -#include "deepgalois/node.h" +#include "deepgalois/layers/node.h" #include namespace deepgalois { From 6116ad939f6b7ce5ea89479c74b66f9c6d0932a8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:22:17 -0600 Subject: [PATCH 083/660] utils in deepgalois namespace --- libdeepgalois/include/deepgalois/utils.h | 5 +++++ libdeepgalois/src/math_functions.cpp | 6 +++--- lonestargnn/gcn/gcn.cpp | 4 ++-- 3 files 
changed, 10 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 086dcf321a..8b76d570dc 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -8,8 +8,11 @@ #include #include +namespace deepgalois { + const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset + enum class net_phase { train, test }; class ResourceManager { @@ -128,3 +131,5 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, in.close(); return sample_count; } + +} diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 144419f16d..979f5ce9d7 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -344,7 +344,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, assert(masks.size() == out.size()); // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -352,7 +352,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& masks, float_t* out) { for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -360,7 +360,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { for (size_t i = 0; i < n; ++i) - masks[i] = bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate); for (size_t i = 0; i < n; ++i) out[i] = in[i] * masks[i] * scale; } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 7b4977dbe1..005e6b1477 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -15,7 +15,7 @@ int main(int argc, char** argv) { network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); - ResourceManager rm; + deepgalois::ResourceManager rm; // the optimizer used to update parameters, see optimizer.h for more details // optimizer *opt = new gradient_descent(); @@ -40,7 +40,7 @@ int main(int argc, char** argv) { for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; } else - test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); + test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, test_mask); galois::StatTimer Ttest("Test"); Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, From 78f609a837b4fb1d49af691d2ee4db83585b5fa3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:24:32 -0600 Subject: [PATCH 084/660] TODOs to deepgalois utils: galois supports a lot of what is being defined there, so reuse galois instead --- libdeepgalois/include/deepgalois/utils.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 8b76d570dc..51c0bb5c95 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -15,6 +15,8 @@ const 
std::string path = enum class net_phase { train, test }; +//! tracks max mem usage with rusage +// TODO use Galois's getrusage functionality class ResourceManager { public: ResourceManager() {} @@ -41,6 +43,7 @@ class ResourceManager { } }; +// TODO don't need a separate timer: use Galois's regular timer class Timer { public: Timer() {} From e55eea07fe447a6c4b57d4f688871e252c08a47b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:26:24 -0600 Subject: [PATCH 085/660] gtypes, math func, types TODO need namespaces --- libdeepgalois/include/deepgalois/gtypes.h | 2 ++ libdeepgalois/include/deepgalois/math_functions.hh | 2 ++ libdeepgalois/include/deepgalois/types.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index e11c1058cc..5278660692 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -2,6 +2,8 @@ #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" +// TODO namespace + typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 3e1af207da..b5c51203f8 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -16,6 +16,8 @@ extern "C" { //#include } +// TODO namespace + const float negative_slope = 0; void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index b669a25188..118f04bd04 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -3,6 +3,8 @@ #include #include +// TODO namespace + #ifdef CNN_USE_DOUBLE typedef double float_t; typedef double feature_t; From f1038c7b20ef4c41757e5b7aa52d034fd849df08 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:27:39 -0600 Subject: [PATCH 086/660] lgraph namespace deepgalois --- libdeepgalois/include/deepgalois/lgraph.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 2eb5ec6863..7a86960338 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -8,6 +8,9 @@ #include #include #include + +namespace deepgalois { + typedef unsigned IndexT; typedef float ValueT; @@ -97,4 +100,5 @@ class LGraph { } }; +} // namespace #endif From a79de8096fb4c68757de73db72d8044965d9d18b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:28:49 -0600 Subject: [PATCH 087/660] aggregator deepgalois namespace --- libdeepgalois/include/deepgalois/aggregator.h | 5 +++++ libdeepgalois/src/aggregator.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/aggregator.h b/libdeepgalois/include/deepgalois/aggregator.h index c54f8f69bc..17a8451aee 100644 --- a/libdeepgalois/include/deepgalois/aggregator.h +++ b/libdeepgalois/include/deepgalois/aggregator.h @@ -2,12 +2,17 @@ #include "deepgalois/types.h" #ifdef CPU_ONLY #include "deepgalois/gtypes.h" + +namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +} #else #include "graph_gpu.h" +namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const 
float_t* norm_factor); void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +} #endif diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index c2a50710dd..360300dba3 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,7 +1,7 @@ #include "deepgalois/aggregator.h" #include "deepgalois/math_functions.hh" -void update_all(size_t len, Graph& g, const float_t* in, float_t* out, +void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { From ef5447550346d613be7ef497b4c78bd7169e01b8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:32:59 -0600 Subject: [PATCH 088/660] removed unused files in deepgalois --- libdeepgalois/include/unused/random.h | 71 --------------------------- libdeepgalois/include/unused/timer.h | 28 ----------- 2 files changed, 99 deletions(-) delete mode 100644 libdeepgalois/include/unused/random.h delete mode 100644 libdeepgalois/include/unused/timer.h diff --git a/libdeepgalois/include/unused/random.h b/libdeepgalois/include/unused/random.h deleted file mode 100644 index b63914bca1..0000000000 --- a/libdeepgalois/include/unused/random.h +++ /dev/null @@ -1,71 +0,0 @@ -// From Caffe library it seems -// TODO get the license from it - -#ifndef RANDOM_H -#define RANDOM_H -typedef boost::mt19937 rng_t; - -// random seeding -int64_t seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, using fallback algorithm " - "to generate seed instead."; - if (f) - fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} - -// This random number generator facade hides boost and CUDA rng -// implementation from one another (for cross-platform compatibility). 
-class RNG { -public: - RNG() : generator_(new Generator()) {} - explicit RNG(unsigned int seed) : generator_(new Generator(seed)) {} - explicit RNG(const RNG&); - RNG& operator=(const RNG& other) { - generator_ = other.generator_; - return *this; - } - void* generator() { return static_cast(generator_->rng()); } - -private: - class Generator { - public: - Generator() : rng_(new rng_t(seedgen())) {} - explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} - rng_t* rng() { return rng_.get(); } - - private: - std::shared_ptr rng_; - }; - - std::shared_ptr generator_; -}; - -std::shared_ptr random_generator_; -inline static RNG& rng_stream() { - random_generator_.reset(new RNG()); - return *random_generator_; -} - -inline rng_t* rng() { return static_cast(rng_stream().generator()); } - -#include -template -void rng_bernoulli(const DataTy p, std::vector& r) { - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator> - variate_generator(rng(), random_distribution); - for (size_t i = 0; i < r.size(); ++i) - r[i] = static_cast(variate_generator()); -} - -#endif diff --git a/libdeepgalois/include/unused/timer.h b/libdeepgalois/include/unused/timer.h deleted file mode 100644 index af01412463..0000000000 --- a/libdeepgalois/include/unused/timer.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef TIMER_H_ -#define TIMER_H_ -#include - -class Timer { -public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { - return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; - } - double Millisecs() const { - return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; - } - double Microsecs() const { - return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; - } - -private: - struct timeval start_time_; - struct timeval elapsed_time_; -}; -#endif // TIMER_H_ From f9ec7df2c72ade22be05260d314ce6f56ec3c503 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:33:16 -0600 Subject: [PATCH 089/660] removed the gnn directory from experimental (unused) --- lonestar/experimental/gnn/CMakeLists.txt | 16 - lonestar/experimental/gnn/README.md | 60 --- lonestar/experimental/gnn/gnn.cpp | 46 -- lonestar/experimental/gnn/gnn.h | 32 -- lonestar/experimental/gnn/graph_sage.cpp | 41 -- lonestar/experimental/gnn/layers.h | 8 - .../gnn/layers/arithmetic_layer.h | 22 - .../gnn/layers/graph_conv_layer.h | 186 ------- lonestar/experimental/gnn/layers/layer.h | 156 ------ .../experimental/gnn/layers/linear_layer.h | 28 - lonestar/experimental/gnn/layers/relu_layer.h | 24 - .../gnn/layers/softmax_loss_layer.h | 47 -- lonestar/experimental/gnn/lgraph.h | 179 ------- lonestar/experimental/gnn/math_functions.hpp | 500 ------------------ lonestar/experimental/gnn/net.h | 341 ------------ lonestar/experimental/gnn/node.h | 109 ---- lonestar/experimental/gnn/optimizer.h | 221 -------- lonestar/experimental/gnn/random.h | 63 --- lonestar/experimental/gnn/run-citeseer.sh | 1 - lonestar/experimental/gnn/timer.h | 21 - lonestar/experimental/gnn/types.h | 34 -- lonestar/experimental/gnn/utils.h | 119 ----- 22 files changed, 2254 deletions(-) delete mode 100644 lonestar/experimental/gnn/CMakeLists.txt delete mode 100644 lonestar/experimental/gnn/README.md delete mode 100644 lonestar/experimental/gnn/gnn.cpp delete mode 100644 lonestar/experimental/gnn/gnn.h delete mode 100644 
lonestar/experimental/gnn/graph_sage.cpp delete mode 100644 lonestar/experimental/gnn/layers.h delete mode 100644 lonestar/experimental/gnn/layers/arithmetic_layer.h delete mode 100644 lonestar/experimental/gnn/layers/graph_conv_layer.h delete mode 100644 lonestar/experimental/gnn/layers/layer.h delete mode 100644 lonestar/experimental/gnn/layers/linear_layer.h delete mode 100644 lonestar/experimental/gnn/layers/relu_layer.h delete mode 100644 lonestar/experimental/gnn/layers/softmax_loss_layer.h delete mode 100644 lonestar/experimental/gnn/lgraph.h delete mode 100644 lonestar/experimental/gnn/math_functions.hpp delete mode 100644 lonestar/experimental/gnn/net.h delete mode 100644 lonestar/experimental/gnn/node.h delete mode 100644 lonestar/experimental/gnn/optimizer.h delete mode 100644 lonestar/experimental/gnn/random.h delete mode 100755 lonestar/experimental/gnn/run-citeseer.sh delete mode 100644 lonestar/experimental/gnn/timer.h delete mode 100644 lonestar/experimental/gnn/types.h delete mode 100644 lonestar/experimental/gnn/utils.h diff --git a/lonestar/experimental/gnn/CMakeLists.txt b/lonestar/experimental/gnn/CMakeLists.txt deleted file mode 100644 index cff09bb371..0000000000 --- a/lonestar/experimental/gnn/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -SET(USE_BLAS ON CACHE BOOL "Use blas") - -SET(BLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(BLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) - -if (USE_BLAS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_BLAS") - include_directories(${BLAS_INC}) - link_directories(${BLAS_LIB}) -endif() - -app(gnn gnn.cpp) - -if (USE_BLAS) - target_link_libraries(gnn -lopenblas) -endif() diff --git a/lonestar/experimental/gnn/README.md b/lonestar/experimental/gnn/README.md deleted file mode 100644 index 930609763c..0000000000 --- a/lonestar/experimental/gnn/README.md +++ /dev/null @@ -1,60 +0,0 @@ -DESCRIPTION -=========== - -This application does vertex classification in an undirected graph. -It uses graph neural network (GNN) to train the vertex features -which are then used to classify vertices into different classes. - -INPUT -=========== - -The input dataset contains three parts: -1. the input graph file: edgelist format of a |V| x |V| sparse matrix. -2. the vertex label file: |V| lines with each line a integer. -3. the input feature file: edgelist format of |V| x |D| sparse matrix. - -Vertex ids are expected to be sequential integers between 0 and |V|-1. -|V| is the number of vertices. |D| is the dimension of input feature vectors. - -BUILD -=========== - -1. Run cmake at BUILD directory `cd build; cmake -DUSE_EXP=1 ../` - -2. Run `cd /lonestar/experimental/gnn; make -j` - -RUN -=========== - -The following are a few example command lines. 
- -$ export OPENBLAS_NUM_THREADS=28 -$ ./gnn cora -t=1 -k=3 -$ ./gnn citeseer -t=3 -k=30 -$ ./gnn reddit -t=56 -k=3 - -PERFORMANCE -=========== -- I -- I -- I - -REFERENCES -=========== -The GCN model: -Semi-Supervised Classification with Graph Convolutional Networks (ICLR 2017) -http://arxiv.org/abs/1609.02907 -https://github.com/tkipf/gcn - -DGL: -Deep Graph Library: Towards Efficient and Scalable Deep Learning on Graphs -https://arxiv.org/abs/1909.01315 -https://github.com/dmlc/dgl - -GraphSAGE: -Inductive Representation Learning on Large Graphs -http://snap.stanford.edu/graphsage/ - -NeuGraph: Parallel Deep Neural Network Computation on Large Graphs -https://www.usenix.org/conference/atc19/presentation/ma - diff --git a/lonestar/experimental/gnn/gnn.cpp b/lonestar/experimental/gnn/gnn.cpp deleted file mode 100644 index 97cb2620af..0000000000 --- a/lonestar/experimental/gnn/gnn.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Graph Neural Networks -// Xuhao Chen -#include "gnn.h" - -const char* name = "Graph Convolutional Networks"; -const char* desc = "Graph convolutional neural networks on an undirected graph"; -const char* url = 0; - -int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); - network.construct_layers(); // default setting for now; see its implementation to find how to customize it by the user - network.print_layers_info(); - ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("TrainAndVal"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); - - if (do_test) { - // test using test samples - size_t n = network.get_nnodes(); - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 0, test_end = n, test_count = n; - MaskList test_mask(n, 0); - if (dataset == "reddit") { - test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; - for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; - } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_count, test_mask, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); - } - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; -} - diff --git a/lonestar/experimental/gnn/gnn.h b/lonestar/experimental/gnn/gnn.h deleted file mode 100644 index f80dacf4ed..0000000000 --- a/lonestar/experimental/gnn/gnn.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _GNN_H_ -#define _GNN_H_ - -#include "galois/Galois.h" -#include "galois/Reduction.h" -#include "galois/Timer.h" -#include "galois/ParallelSTL.h" -#include "llvm/Support/CommandLine.h" -#include "Lonestar/BoilerPlate.h" -#include "galois/runtime/Profile.h" -#include - -namespace cll = llvm::cl; -static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph -static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt learning_rate("lr", cll::desc("Initial 
learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); -static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); -static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); -static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); -static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); -static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -#define CHUNK_SIZE 256 - -#include "types.h" -#include "utils.h" -#include "net.h" - -#endif diff --git a/lonestar/experimental/gnn/graph_sage.cpp b/lonestar/experimental/gnn/graph_sage.cpp deleted file mode 100644 index b70cdc183c..0000000000 --- a/lonestar/experimental/gnn/graph_sage.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// Graph Neural Networks -// Xuhao Chen -#include "gnn.h" - -const char* name = "GraphSage"; -const char* desc = "A graph neural network variant: GraphSAGE"; -const char* url = 0; - -class GraphSageMean: public graph_conv_layer { - // user-defined combine function -}; - -int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); // default setting for now; see its implementation to find how to customize it by the user - ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("Train"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); - - // test using test samples - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 TODO: replace ad-hoc settings - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); - - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; -} - diff --git a/lonestar/experimental/gnn/layers.h b/lonestar/experimental/gnn/layers.h deleted file mode 100644 index 9650e931a9..0000000000 --- a/lonestar/experimental/gnn/layers.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _LAYERS_H_ -#define _LAYERS_H_ -#include "layers/relu_layer.h" -#include "layers/linear_layer.h" -#include "layers/arithmetic_layer.h" -#include "layers/graph_conv_layer.h" -#include "layers/softmax_loss_layer.h" -#endif diff --git a/lonestar/experimental/gnn/layers/arithmetic_layer.h b/lonestar/experimental/gnn/layers/arithmetic_layer.h deleted file mode 100644 index aed91e0379..0000000000 --- a/lonestar/experimental/gnn/layers/arithmetic_layer.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include "layer.h" - -// element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` -class elementwise_add_layer : public layer { -public: - elementwise_add_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : layer(level, in_dim, out_dim) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("elementwise_add"); } - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < in_data.size(); ++sample) { - for (size_t j = 0; j < in_data[0].size(); j++) - out_data[sample][j] = in_data[sample][j]; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - in_grad = out_grad; - } -}; diff --git a/lonestar/experimental/gnn/layers/graph_conv_layer.h b/lonestar/experimental/gnn/layers/graph_conv_layer.h deleted file mode 100644 index b81f7bc10e..0000000000 --- a/lonestar/experimental/gnn/layers/graph_conv_layer.h +++ /dev/null @@ -1,186 +0,0 @@ -#pragma once -#include "layer.h" - -/* GraphConv Layer - Parameters - ---------- - x: int, number of samples. - y: int, Input feature size. - z: int, Output feature size. - dropout: bool, optional, if True, a dropout operation is applied before other operations. - norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. Default: ``True``. - bias : bool, optional, if True, adds a learnable bias to the output. Default: ``False``. - activation: callable activation function/layer or None, optional - If not None, applies an activation function to the updated node features. Default: ``None``. -*/ -class graph_conv_layer: public layer { -public: - graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), dropout_(dropout) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; - init(); - } - void init() { - std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; - Timer t_alloc; - t_alloc.Start(); - // randomly initialize trainable parameters for conv layers - rand_init_matrix(y, z, W); - //rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); - alloc_grad(); - if (dropout_) { - dropout_mask.resize(x); - for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); - } - in_temp.resize(x*y); - //for (size_t i = 0; i < x; ++i) in_temp[i].resize(y); - out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - //for (size_t i = 0; i < x; ++i) out_temp[i].resize(z); - trans_data.resize(y*x); // y*x - //for (size_t i = 0; i < y; ++i) trans_data[i].resize(x); - if (norm_) norm_factor_counting(); - t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; - } - graph_conv_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, in_dims, out_dims) {} - ~graph_conv_layer() {} - std::string layer_type() const override { return std::string("graph_conv"); } - - // user-defined aggregate function - void aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } - - // user-defined combine function - void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(mat_v, self, a); - mvmul(mat_u, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors - } - - void set_context(net_phase ctx) override { phase_ = ctx; } - - // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - // input: x*y; W: y*z; output: x*z - // if y > z: - // mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - //Timer t_matmul, t_agg, t_dropout; - //t_matmul.Start(); - if (dropout_ && phase_ == net_phase::train) { - //t_dropout.Start(); - //for (size_t i = 0; i < x; ++i) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(in_data[i], dropout_mask[i], &in_temp[i*y]); - }, galois::loopname("dropout")); - //t_dropout.Stop(); - matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - //t_matmul.Stop(); - //t_agg.Start(); - aggregate(graph, out_temp, out_data); // aggregate - //t_agg.Stop(); - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(out_data[i], out_data[i]); - }, galois::loopname("relu")); - } - //double dropout_time = 0; - //if (dropout_ && phase_ == net_phase::train) dropout_time = t_dropout.Millisecs(); - //std::cout << "\n\t" << name_ << " matmul time: " << t_matmul.Millisecs() - // << ", aggregation time: " << t_agg.Millisecs() << ", dropout time: " << dropout_time << "\n"; - } - - // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - if (act_) { - //for (size_t j = 0; j < z; ++j) - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - for (size_t j = 0; j < z; ++j) - //if (out_data[i][j] <= 0.0) out_temp[i][j] = 0.0; - out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::loopname("d_relu")); - //} else out_temp = out_grad; // TODO: avoid copying - } else copy2D1D(out_grad, out_temp); - if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y - update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(in_grad[i], dropout_mask[i], in_grad[i]); - }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); - } - } - - // calculate weight gradients - transpose2D1D(in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z - } - - void degree_counting() { - assert(x == graph->size()); - degrees.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - degrees[v] = std::distance(graph->edge_begin(v), graph->edge_end(v)); - }, galois::loopname("DegreeCounting")); - } - - // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v - void norm_factor_counting() { - degree_counting(); - norm_factor.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); - } - -private: - Graph *graph; - bool act_; // whether to use activation function at the end - bool norm_; // whether to normalize data - bool bias_; // whether to add bias afterwards - bool dropout_; // whether to use dropout at first - net_phase phase_; - size_t x; - size_t y; - size_t z; - vec_t out_temp; - vec_t in_temp; - vec_t trans_data; // y*x - std::vector degrees; - std::vector norm_factor; // normalization constant based on graph structure - std::vector > dropout_mask; - - // Glorot & Bengio (AISTATS 2010) init - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - auto init_range = sqrt(6.0/(dim_x + dim_y)); - std::default_random_engine rng; - std::uniform_real_distribution dist(-init_range, init_range); - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = dist(rng); - } - } - inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = 0; - } - } -}; diff --git a/lonestar/experimental/gnn/layers/layer.h b/lonestar/experimental/gnn/layers/layer.h deleted file mode 100644 index 4a8a545738..0000000000 --- a/lonestar/experimental/gnn/layers/layer.h +++ /dev/null @@ -1,156 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../node.h" -#include "../types.h" -#include "../utils.h" -#include "../optimizer.h" -#include "../math_functions.hpp" -/** - * base class of all kind of NN layers - * - * sub-class should override these methods: - * - forward_propagation ... body of forward-pass calculation - * - back_propagation ... body of backward-pass calculation - * - in_shape ... specify input data shapes - * - out_shape ... specify output data shapes - * - layer_type ... 
name of layer - **/ - -class layer : public node { -public: - layer(unsigned level, std::vector in_dims, std::vector out_dims) : - node(in_dims.size(), out_dims.size()), - level_(level), begin_(0), end_(0), num_dims(in_dims.size()), - input_dims(in_dims), output_dims(out_dims) { add_edge(); } - virtual ~layer() = default; - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) = 0; - virtual std::string layer_type() const = 0; - virtual void set_context(net_phase ctx) {} - //virtual void setup(Graph *g, vec_t *diff, LabelList *lab) = 0; - - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() - << " input[" << input_dims[0] << "," << input_dims[1] - << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, MaskList &masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; - } - void set_in_data(tensor_t data) { - prev_ = std::make_shared(this, input_dims[1]); - prev_->get_data() = data; - prev_->get_gradient().resize(input_dims[0]); - // allocate memory for intermediate gradients - //std::cout << "l0 in_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - for (size_t i = 0; i < input_dims[0]; ++i) - prev_->get_gradient()[i].resize(input_dims[1]); - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[1]); - // allocate memory for intermediate feature vectors - next_->get_data().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_data()[i].resize(output_dims[1]); - } - void alloc_grad() { - // allocate memory for intermediate gradients - //std::cout << "l" << level_ << " out_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - next_->get_gradient().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_gradient()[i].resize(output_dims[1]); - } - void forward() { - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer *opt) { - //std::cout << "[debug] " << name_ << ": updating weight...\n"; - // parallelize only when target size is big enough to mitigate thread spawning overhead. 
- bool parallel = (W.size() >= 512); - //vec_t diff; - //prev()->merge_grads(&diff); - //auto in_data = prev()->get_data(); - //float_t rcp_batch_size = float_t(1.0) / in_data.size(); - //for (size_t i = 0; i < diff.size(); ++i) - // diff[i] *= rcp_batch_size; - opt->update(weight_grad, W, parallel); // W += grad - prev()->clear_grads(); - } - inline acc_t get_masked_loss() { - //acc_t total_loss = acc_t(0); - //size_t valid_sample_count = 0; - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - //for (size_t i = begin_; i < end_; i ++) { - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; - } - -protected: - unsigned level_; // layer id: [0, num_layers-1] - size_t begin_; // sample begin index - size_t end_; // sample end index - size_t count_; // number of samples - MaskList masks_; // masks to show which samples are valid - size_t num_dims; // number of dimensions - std::vector input_dims; // input dimensions - std::vector output_dims; // output dimentions - std::string name_; // name of this layer - bool trainable_; // is this layer trainable - vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E - vec_t weight_grad; // weight gradient for updating parameters - vec_t loss; // error for each vertex: N x 1 -}; - -// head: layer i+1, tail: layer i -inline void connect(layer *head, layer *tail, - size_t head_index = 0, size_t tail_index = 0) { - //auto out_shape = head->out_shape()[head_index]; - //auto in_shape = tail->in_shape()[tail_index]; - //head->setup(false); - //if (in_shape.size() == 0) { - // tail->set_in_shape(out_shape); - // in_shape = out_shape; - //} - //if (out_shape.size() != in_shape.size()) - // connection_mismatch(*head, *tail); - //if (!head->next_[head_index]) - // throw nn_error("output edge must not be null"); - tail->prev_ = head->next_; - tail->prev_->add_next_node(tail); -} - diff --git a/lonestar/experimental/gnn/layers/linear_layer.h b/lonestar/experimental/gnn/layers/linear_layer.h deleted file mode 100644 index e4ff524f3f..0000000000 --- a/lonestar/experimental/gnn/layers/linear_layer.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once -#include "layer.h" - -class linear_layer : public layer { -public: - linear_layer(unsigned level, float_t scale, float_t bias, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { - trainable_ = false; } - linear_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : linear_layer(level, 1.0, 0.0, in_dim, out_dim) { } - std::string layer_type() const override { return "linear"; } - - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) { - for (size_t i = 0; i < input_dims[1]; i ++) - out_data[sample][i] = scale_ * in_data[sample][i] + bias_; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) - for (size_t i = 0; i < input_dims[1]; i++) - in_grad[sample][i] = out_grad[sample][i] * scale_; - 
} -protected: - float_t scale_, bias_; -}; diff --git a/lonestar/experimental/gnn/layers/relu_layer.h b/lonestar/experimental/gnn/layers/relu_layer.h deleted file mode 100644 index 389e6b3c1f..0000000000 --- a/lonestar/experimental/gnn/layers/relu_layer.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once -#include "layer.h" - -// ReLU Layer -class relu_layer : public layer { -public: - relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("relu"); } - // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0) + - negative_slope * std::min(in_data[i][j], (float_t)0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); - } - // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) - // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override {} -}; diff --git a/lonestar/experimental/gnn/layers/softmax_loss_layer.h b/lonestar/experimental/gnn/layers/softmax_loss_layer.h deleted file mode 100644 index bdd52e4d38..0000000000 --- a/lonestar/experimental/gnn/layers/softmax_loss_layer.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once -#include "layer.h" - -class softmax_loss_layer: public layer { -public: - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims, LabelList *lab) - : layer(level, in_dims, out_dims), labels(lab) { - trainable_ = false; - loss.resize(in_dims[0]); // error for each sample - name_ = layer_type() + "_" + std::to_string(level); - } - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : - softmax_loss_layer(level, in_dims, out_dims, NULL) {} - ~softmax_loss_layer() {} - std::string layer_type() const override { return std::string("softmax_loss"); } - - // TODO: need kernel fusion optimization - // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(in_data[i], out_data[i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; // one-hot - loss[i] = cross_entropy(y, out_data[i]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); - } - - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(output_dims[1]); - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; - d_cross_entropy(y, out_data[i], norm_grad); - d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); - } - -private: - LabelList *labels; -}; - diff --git a/lonestar/experimental/gnn/lgraph.h b/lonestar/experimental/gnn/lgraph.h deleted file mode 
100644 index 78f6f76aec..0000000000 --- a/lonestar/experimental/gnn/lgraph.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef __LGRAPH_HPP__ -#define __LGRAPH_HPP__ - -//defines the Learning Graph (LGraph) data structure -#include -#include -#include -#include -#include -#include -typedef unsigned IndexT; -typedef float ValueT; - -struct Edge { - IndexT src; - IndexT dst; - ValueT elabel; - Edge() : src(0), dst(0), elabel(0) {} - Edge(IndexT from, IndexT to, ValueT el) : - src(from), dst(to), elabel(el) {} - std::string to_string() const { - std::stringstream ss; - ss << "e(" << src << "," << dst << "," << elabel << ")"; - return ss.str(); - } -}; -typedef std::vector EdgeList; - -class LGraph { -public: - LGraph() : symmetrize_(false), directed_(false) {} - void clean() { - delete[] rowptr_; - delete[] colidx_; - delete[] weight_; - degrees.clear(); - el.clear(); - //labels_.clear(); - //vertices.clear(); - } - bool directed() const { return directed_; } - size_t num_vertices() const { return num_vertices_; } - size_t num_edges() const { return num_edges_; } - IndexT * out_rowptr() const { return rowptr_; } - IndexT * out_colidx() const { return colidx_; } - unsigned out_degree(IndexT n) const { return rowptr_[n+1] - rowptr_[n]; } - IndexT get_offset(IndexT n) { return rowptr_[n]; } - IndexT get_dest(IndexT n) { return colidx_[n]; } - ValueT get_weight(IndexT n) { return weight_[n]; } - unsigned get_max_degree() { return max_degree; } - //ValueT * labels() { return labels_.data(); } - //ValueT get_label(IndexT n) { return labels_[n]; } - void read_edgelist(const char *filename, bool symmetrize = false) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - IndexT max_vid = 0; - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - IndexT u, v; - edge_stream >> u; - edge_stream >> v; - el.push_back(Edge(u, v, 1)); - if (symmetrize) el.push_back(Edge(v, u, 1)); - if (u > max_vid) max_vid = u; - if (v > max_vid) max_vid = v; - } - in.close(); - directed_ = true; - num_vertices_ = max_vid+1; - num_edges_ = el.size(); - std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " << num_edges_ << "\n"; - MakeGraphFromEL(); - } - -private: - EdgeList el; - bool symmetrize_; // whether to symmetrize a directed graph - bool directed_; - size_t num_vertices_; - size_t num_edges_; - IndexT *rowptr_; - IndexT *colidx_; - ValueT *weight_; - unsigned max_degree; - std::vector degrees; - std::vector labels_; - std::vector > vertices; - - static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } - - void MakeGraphFromEL() { - SquishGraph(); - MakeCSR(false); - } - - void SquishGraph(bool remove_selfloops = true, bool remove_redundents = true) { - std::vector neighbors; - for (size_t i = 0; i < num_vertices_; i++) - vertices.push_back(neighbors); - for (size_t i = 0; i < num_edges_; i ++) - vertices[el[i].src].push_back(el[i]); - el.clear(); - printf("Sorting the neighbor lists..."); - for (size_t i = 0; i < num_vertices_; i ++) - std::sort(vertices[i].begin(), vertices[i].end(), compare_id); - printf(" Done\n"); - //remove self loops - int num_selfloops = 0; - if(remove_selfloops) { - printf("Removing self loops..."); - for(size_t i = 0; i < num_vertices_; i ++) { - for(unsigned j = 0; j < vertices[i].size(); j ++) { - if(i == vertices[i][j].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_selfloops ++; - j --; - } - } - } - printf(" %d selfloops are removed\n", num_selfloops); - num_edges_ -= num_selfloops; - } - // remove redundent - int 
num_redundents = 0; - if(remove_redundents) { - printf("Removing redundent edges..."); - for (size_t i = 0; i < num_vertices_; i ++) { - for (unsigned j = 1; j < vertices[i].size(); j ++) { - if (vertices[i][j].dst == vertices[i][j-1].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_redundents ++; - j --; - } - } - } - printf(" %d redundent edges are removed\n", num_redundents); - num_edges_ -= num_redundents; - } - } - - void MakeCSR(bool transpose) { - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i ++) - degrees[i] = vertices[i].size(); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - - std::vector offsets(degrees.size() + 1); - IndexT total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; - - assert(num_edges_ == offsets[num_vertices_]); - weight_ = new ValueT[num_edges_]; - colidx_ = new IndexT[num_edges_]; - rowptr_ = new IndexT[num_vertices_+1]; - for (size_t i = 0; i < num_vertices_+1; i ++) rowptr_[i] = offsets[i]; - for (size_t i = 0; i < num_vertices_; i ++) { - for (auto it = vertices[i].begin(); it < vertices[i].end(); it ++) { - Edge e = *it; - assert(i == e.src); - if (symmetrize_ || (!symmetrize_ && !transpose)) { - weight_[offsets[e.src]] = e.elabel; - colidx_[offsets[e.src]++] = e.dst; - } - if (symmetrize_ || (!symmetrize_ && transpose)) { - weight_[offsets[e.dst]] = e.elabel; - colidx_[offsets[e.dst]++] = e.src; - } - } - } - } -}; - -#endif diff --git a/lonestar/experimental/gnn/math_functions.hpp b/lonestar/experimental/gnn/math_functions.hpp deleted file mode 100644 index 8791416441..0000000000 --- a/lonestar/experimental/gnn/math_functions.hpp +++ /dev/null @@ -1,500 +0,0 @@ -#ifndef _MATH_FUNCTIONS_ -#define _MATH_FUNCTIONS_ -#include -#include "utils.h" -#include - -#ifdef WITH_BLAS -extern "C" { -#include -//#include -} -#endif - -const float negative_slope = 0; - -// vector add -template -inline void vadd(const std::vector &a, const std::vector &b, std::vector &out) { - //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; - size_t n = out.size(); - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -template -inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -// vector subtract -template -inline void vsub(const std::vector &in_a, const std::vector &in_b, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; -} - -// vector multiply -template -inline void vmul(const std::vector &in_a, const std::vector &in_b, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; -} - -// vector divide -template -inline void vdiv(const std::vector &in_a, const std::vector &in_b, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) { - assert(in_b[i] != 0); - out[i] = in_a[i] / in_b[i]; - } -} - -// vector add scalar -template -inline void add_scalar(const DataTy alpha, std::vector &Y) { - for 
(size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; -} - -// vector subtract scalar -template -inline void sub_scalar(const DataTy alpha, std::vector &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] -= alpha; -} - -// vector multiply scalar -template -inline void mul_scalar(const DataTy alpha, std::vector &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; -} - -template -inline void mul_scalar(size_t n, const DataTy alpha, const DataTy *in, DataTy *out) { - for (size_t i = 0; i < n; ++i) out[i] = alpha *in[i]; -} - -// vector divide scalar -template -inline void div_scalar(const DataTy alpha, std::vector &Y) { - assert(alpha != 0); - for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; -} - -// dot product -template -inline DataTy dot(const std::vector &x, const std::vector &y) { - DataTy sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; -} - -// matrix-vector multiply -inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { - size_t m = out_vector.size(); - size_t n = in_vector.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out_vector[i] += matrix[i*n+j] * in_vector[j]; - } - } -} - -// vector-vector multiply -inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { - size_t m = a.size(); - size_t n = b.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out[i][j] += a[i] * b[j]; - } - } -} - -// matrix addition -inline void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { - for (size_t i = 0; i < x; ++i) - for (size_t j = 0; j < y; ++j) - C[i][j] = A[i][j] + B[i][j]; -} - -// TODO: vectorize -template -inline void copy2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); -#ifdef WITH_BLAS - auto ptr = &out[0]; - for (size_t i = 0; i < x; i++) { - std::copy(in[i].begin(), in[i].end(), ptr); - ptr += y; - } -#else - assert(out.size() == x*y); - for (size_t i = 0; i < x; i ++) { - for (size_t j = 0; j < y; j ++) { - out[i*y+j] = in[i][j]; - } - } -#endif -} - -// matrix multiply: all 2D -inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(C.size() == dim_x); - assert(B.size() == dim_z); - assert(B[0].size() == dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k][j]; - } - } - } -} - -inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const vec_t &A, const vec_t &B, vec_t &C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); -#ifdef WITH_BLAS - const int M = dim_x; - const int N = dim_y; - const int K = dim_z; - const float alpha = 1.0; - const float beta = 0.0; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, &A[0], lda, &B[0], ldb, beta, &C[0], N); -#else - //std::cout << "using naive matmul, slow\n"; - assert(A.size() == dim_x*dim_z); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i*dim_z+k] * B[k*dim_y+j]; - } - } - } -#endif - Tmatmul.stop(); -} - -inline void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_z = A[0].size(); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); - -#ifdef WITH_BLAS - vec_t A1D(dim_x*dim_z); - copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i][k] * B[k][j]; - } - } - } -#endif -} - -// matrix multiply -inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = C.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(A.size() == dim_x); - assert(B.size() == dim_y*dim_z); - -#ifdef WITH_BLAS - vec_t A1D(dim_x*dim_z); - vec_t C1D(dim_x*dim_y, 0); - auto ptr = &A1D[0]; - for (size_t i = 0; i < dim_x; i++) { - std::copy(A[i].begin(), A[i].end(), ptr); - ptr += dim_z; - } - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C1D); - for (size_t i = 0; i < dim_x; i++) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = C1D[i*dim_y+j]; - } - } -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k*dim_y+j]; - } - } - } -#endif -} - -template -inline void transpose2D(const tensor_t &in, tensor_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i][j] = in[j][i]; - } - } -} - -// TODO: vectorize -template -inline void transpose2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - assert(out.size() == x*y); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j][i]; - } - } -} - -template -inline void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j*y+i]; - } - } -} - -template -inline int argmax(const size_t n, const std::vector &x) { - DataTy max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - -inline void clear(vec_t &in) { - for (size_t i = 0; i < in.size(); i++) in[i] = 0; -} - -inline void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, 
galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -inline void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -template -inline void relu(const std::vector &in, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (DataTy)0) + negative_slope * std::min(in[i], (DataTy)0); - } -} - -template -inline void d_relu(const std::vector &in_diff, const std::vector &fv, std::vector &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (DataTy)0) + negative_slope * (fv[i] <= (DataTy)0)); - } -} - -inline void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { - vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff -} - -inline void d_vadd(vec_t &in_diff, vec_t &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) - out_diff[i] = in_diff[i]; -} - -template -inline float reduce_mean(const std::vector &x) { - size_t n = x.size(); - assert(n > 0); - float sum = (float)x[0]; - for (size_t i = 1; i < n; i++) { - sum += (float)x[i]; - } - return sum / (float)n; -} - -const float scale_ = 1. / (1. - dropout_rate); - -inline void dropout(const vec_t &in, std::vector &mask, vec_t &out) { - assert(mask.size() == out.size()); - //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; -} - -inline void dropout(const vec_t &in, std::vector &mask, float_t *out) { - for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; -} - -inline void d_dropout(const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale_; -} - -template -inline DataTy sigmoid_func(DataTy x) { - return 0.5 * tanh(0.5 * x) + 0.5; -} - -// Sigmoid -template -inline void sigmoid(std::vector &fv) { - size_t count = fv.size(); - for (size_t i = 0; i < count; ++i) { - fv[i] = sigmoid_func(fv[i]); - } -} - -// Softmax function takes an N-dimensional vector (X) of real number, -// and transforms it into a vector of real number in range (0,1) which add upto 1. 
-// To make softmax func numerically stable, we simply normalize the values in the vector, -// by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) -// exps = np.exp(X - np.max(X)) -// exps / np.sum(exps) -template -inline void softmax(const std::vector &input, std::vector &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - -template -inline void log_softmax(const std::vector &input, std::vector &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) - denominator += std::exp(input[i] - max); - for (size_t i = 0; i < input.size(); i++) - output[i] = input[i] - max - denominator; -} - -// Due to the desirable property of softmax function outputting a probability distribution, -// we often use it as the final layer in neural networks. -// For this we need to calculate the derivative or gradient, -// and pass it back to the previous layer during backpropagation. -template -inline void d_softmax(const std::vector &y, const std::vector &p, - std::vector &dy, const std::vector &dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - //DataTy delta_ij = i == j? 1 : 0; - //df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -/* - for (size_t j = 0; j < x.size(); j++) { - for (size_t k = 0; k < x.size(); k++) { - df[k] = (k == j) ? y[j] * (float_t(1) - y[j]) : -y[k] * y[j]; - } - dx[j] = vectorize::dot(&dy[0], &df[0], len); - } -*/ -} - -// cross-entropy loss function for multi-class classification -// y: ground truth -// p: predicted probability -template -inline DataTy cross_entropy(const std::vector &y, const std::vector &p) { - auto n = y.size(); - assert(n > 0); - DataTy loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) continue; - if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - //if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]);// + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); - //loss -= y[i] * std::log(p[i]); - } - return loss; -} - -template -inline void d_cross_entropy(const std::vector &y, const std::vector &p, std::vector &d) { - auto n = y.size(); - //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - //d[i] = p[i] - y[i]; - } -} - -#endif diff --git a/lonestar/experimental/gnn/net.h b/lonestar/experimental/gnn/net.h deleted file mode 100644 index fac7caee00..0000000000 --- a/lonestar/experimental/gnn/net.h +++ /dev/null @@ -1,341 +0,0 @@ -#ifndef _MODEL_H_ -#define _MODEL_H_ - -#include -#include "gnn.h" -#include "lgraph.h" -#include "layers.h" -#include "optimizer.h" - -#define NUM_CONV_LAYERS 2 - -// N: number of vertices, D: feature vector dimentions, -// E: number of distinct labels, i.e. 
number of vertex classes -// layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) -// layer 2: features N x 16, weights 16 x E, out N x E -class Net { -public: - Net() {} - - // user-defined aggregate function - virtual void aggregate(Graph *g, size_t dim, const tensor_t &in_feats, tensor_t &out_feats) {} - - // user-defined combine function - virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} - - void init() { - assert(dropout_rate < 1.0); - read_graph(dataset, g); - n = g.size(); // N - labels.resize(n, 0); // label for each vertex: N x 1 - num_classes = read_labels(dataset, labels); - - std::cout << "Reading label masks ... "; - train_mask.resize(n, 0); - val_mask.resize(n, 0); - if (dataset == "reddit") { - train_begin = 0, train_count = 153431, train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; - } else { - train_count = read_masks(dataset, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset, "val", val_begin, val_end, val_mask); - } - std::cout << "Done\n"; - - num_layers = NUM_CONV_LAYERS + 1; - feature_dims.resize(num_layers + 1); - input_features.resize(n); // input embedding: N x D - feature_dims[0] = read_features(dataset, input_features); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - feature_dims[3] = num_classes; // normalized output embedding: E - layers.resize(num_layers); - } - size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } - size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_nnodes() { return n; } - size_t get_nedges() { return g.sizeEdges(); } - size_t get_ft_dim() { return feature_dims[0]; } - size_t get_nclasses() { return num_classes; } - size_t get_label(size_t i) { return labels[i]; } - void construct_layers() { - std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer - layers[0]->set_in_data(input_features); // feed input data - } - - void set_netphase(net_phase phase) { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_context(phase); - } - - void print_layers_info() { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->print_layer_info(); - } - - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true) { - assert(layer_id < NUM_CONV_LAYERS); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, in_dims, out_dims); - if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); - } - - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims, &labels); - connect(layers[layer_id-1], layers[layer_id]); - } - - // forward propagation: [begin, end) is the range of samples used. 
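// ---------------------------------------------------------------------------
// Illustrative sketch (not from the original patch): the (begin, end, count,
// masks) convention used by fprop()/evaluate() below. masks holds one entry
// per node, only nodes in [begin, end) whose mask is 1 contribute to the
// masked loss/accuracy, and count is the number of such nodes. A minimal,
// self-contained helper in that spirit; the helper name and the plain
// std::vector<uint8_t> mask type are assumptions, and the caller must ensure
// masks.size() >= end.
#include <cstddef>
#include <cstdint>
#include <vector>

inline size_t build_range_mask(size_t begin, size_t end,
                               std::vector<uint8_t>& masks) {
  size_t count = 0;
  for (size_t i = begin; i < end; i++) {
    masks[i] = 1; // node i belongs to this split (train/val/test)
    count++;
  }
  return count; // denominator used when averaging the masked loss/accuracy
}
// ---------------------------------------------------------------------------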
- acc_t fprop(size_t begin, size_t end, size_t count, MaskList &masks) { - // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i ++) - layers[i]->forward(); - return layers[num_layers-1]->get_masked_loss(); - } - - // back propogation - void bprop() { - for (size_t i = num_layers; i != 0; i --) - layers[i-1]->backward(); - } - - // update trainable weights after back-propagation - void update_weights(optimizer *opt) { - for (size_t i = 0; i < num_layers; i ++) - if (layers[i]->trainable()) layers[i]->update_weight(opt); - } - - // evaluate, i.e. inference or predict - double evaluate(size_t begin, size_t end, size_t count, MaskList &masks, acc_t &loss, acc_t &acc) { - Timer t_eval; - t_eval.Start(); - loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks); - t_eval.Stop(); - return t_eval.Millisecs(); - } - - // training - void train(optimizer *opt) { - std::cout << "\nStart training...\n"; - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - Timer t_epoch; - // run epoches - for (size_t i = 0; i < epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; - t_epoch.Start(); - - // training steps - set_netphase(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict - Tfw.stop(); - Tbw.start(); - bprop(); // back propogation - Tbw.stop(); - Tupdate.start(); - update_weights(opt); // update parameters - Tupdate.stop(); - set_netphase(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - - if (do_validate) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); - Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; - } else { - std::cout << " train_time = " << epoch_time << " ms\n"; - } - } - } - -protected: - size_t n; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 - std::vector feature_dims; // feature dimnesions for each layer - - Graph g; // the input graph, |V| = N - tensor_t input_features; // input features: N x D - std::vector labels; // labels for classification: N x 1 - MaskList train_mask, val_mask; // masks for traning and validation - size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - - std::vector layers; // all the layers in the neural network - /* - inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); - } - //*/ - - // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). 
- // Note that labels is not one-hot encoded vector and it can be computed - // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. - size_t read_labels(std::string dataset_str, LabelList &labels) { - std::cout << "Reading labels ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == labels.size()); // number of vertices - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < n; ++idx) { - label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; - } - } - v ++; - } - in.close(); - t_read.Stop(); - // number of vertex classes - std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - size_t read_features(std::string dataset_str, tensor_t &feats) { - std::cout << "Reading features ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == feats.size()); // m = number of vertices - for (size_t i = 0; i < m; ++i) { - feats[i].resize(n); - for (size_t j = 0; j < n; ++j) - feats[i][j] = 0; - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - feats[u][v] = w; - } - /* - for (size_t i = 0; i < 10; ++i) - for (size_t j = 0; j < n; ++j) - if (feats[i][j] > 0) - std::cout << "feats[" << i << "][" << j << "]: " << feats[i][j] << std::endl; - //*/ - in.close(); - t_read.Stop(); - std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - unsigned read_graph(std::string dataset_str, Graph &graph) { - //printf("Start readGraph\n"); - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - LGraph lgraph; - unsigned max_degree = 0; - if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - lgraph.read_edgelist(filename.c_str(), true); //symmetrize - genGraph(lgraph, graph); - } else if (filetype == "gr") { - std::string filename = path + dataset_str + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph, filename); - /* - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - graph.getData(vid) = 1; - //for (auto e : graph.edges(n)) graph.getEdgeData(e) = 1; - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("assignVertexLabels")); - std::vector degrees(graph.size()); - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - degrees[vid] = std::distance(graph.edge_begin(vid), graph.edge_end(vid)); - }, galois::loopname("computeMaxDegree")); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - */ - } else { printf("Unkown file format\n"); exit(1); } - if (filetype != "gr") { - max_degree = lgraph.get_max_degree(); - lgraph.clean(); - } - printf("max degree = %u\n", max_degree); - Tread.stop(); - //printf("Done readGraph\n"); - std::cout << "num_vertices " << g.size() << " num_edges " << g.sizeEdges() << "\n"; - return max_degree; - } - - void genGraph(LGraph &lg, Graph &g) { - g.allocateFrom(lg.num_vertices(), 
lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i+1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); // do not consider edge labels now - } - } - - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { - // comparing outputs with the ground truth (labels) - //acc_t accuracy_all = 0.0; - AccumF accuracy_all; - accuracy_all.reset(); - //for (size_t i = begin; i < end; i++) { - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - int prediction = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); - if ((label_t)prediction == labels[i]) accuracy_all += 1.0; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} - return accuracy_all.reduce() / (acc_t)count; - } -}; - -#endif diff --git a/lonestar/experimental/gnn/node.h b/lonestar/experimental/gnn/node.h deleted file mode 100644 index deffebad9b..0000000000 --- a/lonestar/experimental/gnn/node.h +++ /dev/null @@ -1,109 +0,0 @@ -#pragma once -#include -class node; -class layer; -class edge; - -typedef std::shared_ptr edgeptr_t; - -// node data structure -class node : public std::enable_shared_from_this { -public: - node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} - virtual ~node() {} - const edgeptr_t prev() const { return prev_; } - //const std::vector &prev() const { return prev_; } - const edgeptr_t next() const { return next_; } - //const std::vector &next() const { return next_; } - //std::vector prev_nodes() const; - //std::vector next_nodes() const; - -protected: - node() = delete; - friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); - //mutable std::vector prev_; - //mutable std::vector next_; - mutable edgeptr_t prev_; - mutable edgeptr_t next_; -}; - -// edges manage the input/output data and gradients between nodes -class edge { -public: - edge(node *prev, size_t len) : - ft_dim_(len), - data_({vec_t(len)}), - grad_({vec_t(len)}), - prev_(prev) {} - - void merge_grads(vec_t *dst) { - assert(!grad_.empty()); - const auto &grad_head = grad_[0]; - size_t sz = grad_head.size(); - dst->resize(sz); - float_t *pdst = &(*dst)[0]; - std::copy(grad_head.begin(), grad_head.end(), pdst); - // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < grad_.size(); ++sample) { - for (size_t i = 0; i < sz; i++) - pdst[i] += grad_[sample][i]; - //vectorize::reduce(&grad_[sample][0], sz, pdst); - } - } - void clear_grads() { - for (size_t sample = 0; sample < grad_.size(); ++sample) { - auto &g = grad_[sample]; - std::fill(g.begin(), g.end(), 0.0); // TODO: need vectorize - //vectorize::fill(&g[0], g.size(), float_t{0}); - } - } - - tensor_t *get_data_ptr() { return &data_; } - tensor_t &get_data() { return data_; } - //const tensor_t *get_data() const { return &data_; } - const tensor_t &get_data() const { return data_; } - //tensor_t *get_gradient() { return &grad_; } - tensor_t &get_gradient() { return grad_; } - //const tensor_t *get_gradient() const { return &grad_; } - const tensor_t &get_gradient() const { return grad_; } - - //const std::vector &next() const { return next_; } - const node *next() const { return next_; } - node *prev() { return prev_; } - const node *prev() const { return prev_; } - 
//const shape3d &shape() const { return shape_; } - //vector_type vtype() const { return vtype_; } - //void add_next_node(node *next) { next_.push_back(next); } - void add_next_node(node *next) { next_ = next; } -private: - //shape3d shape_; - size_t ft_dim_; - //vector_type vtype_; - tensor_t data_; - tensor_t grad_; - node *prev_; // previous node, "producer" of this tensor - node *next_; // next node, "consumer" of this tensor - //std::vector next_; // next nodes, "consumers" of this tensor -}; -/* -inline std::vector node::prev_nodes() const { - std::vector vecs; - for (auto &e : prev_) { - if (e && e->prev()) { - vecs.insert(vecs.end(), e->prev()); - } - } - return vecs; -} - -inline std::vector node::next_nodes() const { - std::vector vecs; - for (auto &e : next_) { - if (e) { - auto n = e->next(); - vecs.insert(vecs.end(), n.begin(), n.end()); - } - } - return vecs; -} -*/ diff --git a/lonestar/experimental/gnn/optimizer.h b/lonestar/experimental/gnn/optimizer.h deleted file mode 100644 index 2896881fed..0000000000 --- a/lonestar/experimental/gnn/optimizer.h +++ /dev/null @@ -1,221 +0,0 @@ -#pragma once - -#include -#include -#include "types.h" - -// base class of optimizer -// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function) -struct optimizer { - optimizer() = default; - optimizer(const optimizer &) = default; - optimizer(optimizer &&) = default; - optimizer &operator=(const optimizer &) = default; - optimizer &operator=(optimizer &&) = default; - virtual ~optimizer() = default; - virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; - virtual void reset() {} // override to implement pre-learning action -}; - -// helper class to hold N values for each weight -template -struct stateful_optimizer : public optimizer { - void reset() override { for (auto &e : E_) e.clear(); } -protected: - template - vec_t &get(const vec_t &key) { - static_assert(Index < N, "index out of range"); - if (E_[Index][&key].empty()) E_[Index][&key].resize(key.size(), float_t()); - return E_[Index][&key]; - } - std::unordered_map E_[N]; -}; - -/** - * adaptive gradient method - * - * J Duchi, E Hazan and Y Singer, - * Adaptive subgradient methods for online learning and stochastic optimization - * The Journal of Machine Learning Research, pages 2121-2159, 2011. 
- **/ -struct adagrad : public stateful_optimizer<1> { - adagrad() : alpha(learning_rate), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } - } - float_t alpha; // learning rate - private: - float_t eps; -}; - -/** - * RMSprop - * - * T Tieleman, and G E Hinton, - * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) - **/ -struct RMSprop : public stateful_optimizer<1> { - RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); - } - float_t alpha; // learning rate - float_t mu; // decay term -private: - float_t eps; // constant value to avoid zero-division -}; - -// Adam: A Method for Stochastic Optimization -// http://arxiv.org/abs/1412.6980 -struct adam : public stateful_optimizer<2> { - adam() : alpha(learning_rate), b1(float_t(0.9)), - b2(float_t(0.999)), b1_t(float_t(0.9)), - b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size(), galois::steal(), galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; - } - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t - float_t b2_t; // decay term power t - -private: - float_t eps; // constant value to avoid zero-division -}; - -/** - * @brief [a new optimizer (2015)] - * @details [see Adam: A Method for Stochastic Optimization (Algorithm 2) - * http://arxiv.org/abs/1412.6980] - * - */ -struct adamax : public stateful_optimizer<2> { - adamax() - : alpha(float_t(0.002)), - b1(float_t(0.9)), - b2(float_t(0.999)), - b1_t(b1), - eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); - b1_t *= b1; - } - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t - -private: - float_t eps; // constant value to avoid zero-division -}; - -/** - * SGD without momentum - * - * slightly faster than tiny_dnn::momentum - **/ -struct gradient_descent : public optimizer { - gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const 
vec_t &dW, vec_t &W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); - } - float_t alpha; // learning rate - float_t lambda; // weight decay -}; - -/** - * SGD with momentum - * - * B T Polyak, - * Some methods of speeding up the convergence of iteration methods - * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. - **/ -struct momentum : public stateful_optimizer<1> { - public: - momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - //}); - }, galois::loopname("momentum_update")); - } - - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum -}; - -/** - * SGD with Nesterov momentum - * - * Y Nesterov, - * A method for unconstrained convex minimization problem with the rate of - * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. - **/ -struct nesterov_momentum : public stateful_optimizer<1> { - public: - nesterov_momentum() - : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - //}); - }, galois::loopname("nesterov_momentum_update")); - } - - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum -}; - diff --git a/lonestar/experimental/gnn/random.h b/lonestar/experimental/gnn/random.h deleted file mode 100644 index 9236e9c391..0000000000 --- a/lonestar/experimental/gnn/random.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef RANDOM_H -#define RANDOM_H -typedef boost::mt19937 rng_t; - -// random seeding -int64_t seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} - -// This random number generator facade hides boost and CUDA rng -// implementation from one another (for cross-platform compatibility). 
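// ---------------------------------------------------------------------------
// Illustrative sketch (not from the original patch): the seed-then-sample
// pattern this header wraps. One mersenne-twister engine (the same engine
// type as rng_t) is seeded once and a 0/1 Bernoulli mask is drawn from it,
// as the dropout code does. The helper name is an assumption; only standard
// boost.random facilities are used, mirroring the variate_generator style of
// rng_bernoulli below.
#include <cstddef>
#include <vector>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/bernoulli_distribution.hpp>
#include <boost/random/variate_generator.hpp>

inline std::vector<unsigned> draw_bernoulli_mask(size_t n, float keep_prob,
                                                 unsigned seed) {
  boost::mt19937 engine(seed); // in the real code the seed would come from seedgen()
  boost::bernoulli_distribution<float> dist(keep_prob);
  boost::variate_generator<boost::mt19937&, boost::bernoulli_distribution<float> >
      coin(engine, dist);
  std::vector<unsigned> mask(n);
  for (size_t i = 0; i < n; i++)
    mask[i] = coin() ? 1u : 0u; // 1 means the corresponding feature is kept
  return mask;
}
// ---------------------------------------------------------------------------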
-class RNG { -public: - RNG() : generator_(new Generator()) { } - explicit RNG(unsigned int seed) : generator_(new Generator(seed)) { } - explicit RNG(const RNG&); - RNG& operator=(const RNG& other) { generator_ = other.generator_; return *this; } - void* generator() { return static_cast(generator_->rng()); } -private: - class Generator { - public: - Generator() : rng_(new rng_t(seedgen())) {} - explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} - rng_t* rng() { return rng_.get(); } - private: - std::shared_ptr rng_; - }; - - std::shared_ptr generator_; -}; - -std::shared_ptr random_generator_; -inline static RNG& rng_stream() { - random_generator_.reset(new RNG()); - return *random_generator_; -} - -inline rng_t* rng() { - return static_cast(rng_stream().generator()); -} - -#include -template -void rng_bernoulli(const DataTy p, std::vector &r) { - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(rng(), random_distribution); - for (size_t i = 0; i < r.size(); ++i) - r[i] = static_cast(variate_generator()); -} - -#endif diff --git a/lonestar/experimental/gnn/run-citeseer.sh b/lonestar/experimental/gnn/run-citeseer.sh deleted file mode 100755 index 30772b4f6e..0000000000 --- a/lonestar/experimental/gnn/run-citeseer.sh +++ /dev/null @@ -1 +0,0 @@ -./gnn citeseer -t=56 -k=3 diff --git a/lonestar/experimental/gnn/timer.h b/lonestar/experimental/gnn/timer.h deleted file mode 100644 index e6c838c37b..0000000000 --- a/lonestar/experimental/gnn/timer.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef TIMER_H_ -#define TIMER_H_ -#include - -class Timer { -public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } -private: - struct timeval start_time_; - struct timeval elapsed_time_; -}; -#endif // TIMER_H_ diff --git a/lonestar/experimental/gnn/types.h b/lonestar/experimental/gnn/types.h deleted file mode 100644 index bc9fe21049..0000000000 --- a/lonestar/experimental/gnn/types.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef TYPES_H -#define TYPES_H -#include -#include "galois/Galois.h" -#include "galois/graphs/LCGraph.h" - -#ifdef CNN_USE_DOUBLE -typedef double float_t; -typedef double feature_t; -#else -typedef float float_t; -typedef float feature_t; // feature type -#endif -typedef std::vector vec_t; // feature vector (1D) -typedef std::vector tensor_t; // feature vectors (2D): num_samples x feature_dim -typedef std::vector FV; // feature vector -typedef std::vector FV2D; // feature vectors: num_samples x feature_dim -typedef float acc_t; // Accuracy type -typedef short label_t; // label is for classification (supervised learning) -typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test -typedef std::vector LabelList; // label list to store label for each vertex -typedef std::vector MaskList; // mask list to store mask for each vertex -typedef galois::GAccumulator AccumF; -typedef galois::GAccumulator AccumU; - -#ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#else -typedef 
galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#endif - -typedef Graph::GraphNode GNode; - -#endif diff --git a/lonestar/experimental/gnn/utils.h b/lonestar/experimental/gnn/utils.h deleted file mode 100644 index 70356654b9..0000000000 --- a/lonestar/experimental/gnn/utils.h +++ /dev/null @@ -1,119 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include "gnn.h" - -std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset -enum class net_phase { train, test }; - -class ResourceManager { -public: - ResourceManager() {} - ~ResourceManager(){} - //peak memory usage - std::string get_peak_memory() { - double kbm; - struct rusage CurUsage; - getrusage(RUSAGE_SELF, &CurUsage); - kbm = (double)CurUsage.ru_maxrss; - double mbm = kbm / 1024.0; - double gbm = mbm / 1024.0; - return - "Peak memory: " + - to_string_with_precision(mbm, 3) + " MB; " + - to_string_with_precision(gbm, 3) + " GB"; - } -private: - template - std::string to_string_with_precision(const T a_value, const int& n) { - std::ostringstream out; - out << std::fixed; - out << std::setprecision(n) << a_value; - return out.str(); - } -}; - -class Timer { -public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } -private: - struct timeval start_time_; - struct timeval elapsed_time_; -}; - -class random_generator { -public: - static random_generator &get_instance() { - static random_generator instance; - return instance; - } - std::mt19937 &operator()() { return gen_; } - void set_seed(unsigned int seed) { gen_.seed(seed); } - -private: - random_generator() : gen_(1) {} - std::mt19937 gen_; -}; - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_int_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_real_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - -inline bool bernoulli(float_t p) { - return uniform_rand(float_t{0}, float_t{1}) <= p; -} - -size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, MaskList &masks) { - if (dataset_str != "citeseer" && dataset_str != "cora") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - //std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count ++; - } - } - i ++; - } - //std::cout << mask_type + "_mask range: [" << begin << ", " << end - // << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; -} - From 
5cec98cebb6fb1507d884bf122ebd11a1680d767 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:39:44 -0600 Subject: [PATCH 090/660] correctly context.h copyright header (replace with TODO) --- libdeepgalois/include/deepgalois/context.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index d7f400d582..2f769dc917 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -1,12 +1,6 @@ #pragma once /** - * Code modified from below - * - * https://github.com/BVLC/caffe/blob/master/include/caffe/common.hpp - * - * Copyright (c) 2014-2017 The Regents of the University of California (Regents) - * All rights reserved. - * Reused/revised under BSD 2-Clause license + * TODO if used from somewhere, get copyright/licences */ #include From bed7ba7c33cb71d1eba53bf0c6f29f646c0a7e99 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:50:06 -0600 Subject: [PATCH 091/660] gcn: some comments, gPrint --- lonestargnn/gcn/gcn.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 005e6b1477..3357fd904e 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -15,6 +15,8 @@ int main(int argc, char** argv) { network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); + + // tracks peak memory usage deepgalois::ResourceManager rm; // the optimizer used to update parameters, see optimizer.h for more details @@ -27,7 +29,7 @@ int main(int argc, char** argv) { Ttrain.stop(); if (do_test) { - std::cout << "\n"; + galois::gPrint("\n"); // test using test samples size_t n = network.get_nnodes(); acc_t test_loss = 0.0, test_acc = 0.0; @@ -45,11 +47,10 @@ int main(int argc, char** argv) { Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); - std::cout << "Testing: test_loss = " << test_loss - << " test_acc = " << test_acc << " test_time = " << test_time - << "\n"; + galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, + " test_time = ", test_time, "\n"); Ttest.stop(); } - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); return 0; } From d46fb46ed0eb256c4ff2690bb8ab69b618eaf688 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 18:14:35 -0600 Subject: [PATCH 092/660] aggregator.h: always include CPU version of update_all will not conflict with GPU since different signature --- libdeepgalois/include/deepgalois/aggregator.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libdeepgalois/include/deepgalois/aggregator.h b/libdeepgalois/include/deepgalois/aggregator.h index 17a8451aee..bdc8c5aa5e 100644 --- a/libdeepgalois/include/deepgalois/aggregator.h +++ b/libdeepgalois/include/deepgalois/aggregator.h @@ -1,13 +1,11 @@ #pragma once #include "deepgalois/types.h" -#ifdef CPU_ONLY #include "deepgalois/gtypes.h" - namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } -#else +#ifndef CPU_ONLY #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, From bbb41251cf49985f71b9a327d137efb0ca046cea Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 18:22:04 -0600 
Subject: [PATCH 093/660] fixing GPU end deepgalois includes/namepsaces --- libdeepgalois/src/aggregator.cu | 6 +++--- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/math_functions.cu | 30 ++++++++++++++--------------- libdeepgalois/src/node.cu | 10 +++++----- libdeepgalois/src/optimizer.cu | 8 ++++---- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index bbd7fbf8b3..cd89cd92b1 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,9 +1,9 @@ #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" -#include "cutils.h" -#include "aggregator.h" -#include "math_functions.hh" +#include "deepgalois/cutils.h" +#include "deepgalois/aggregator.h" +#include "deepgalois/math_functions.hh" // TODO: use warp __device__ void scale_add(const int n, const float_t alpha, const float_t* a, diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 270252c5d8..29bec6f008 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -2,7 +2,7 @@ #include #include #include -#include "context.h" +#include "deepgalois/context.h" // random seeding int64_t cluster_seedgen(void) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index eb8f07c8b3..9131bf9509 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,5 @@ -#include "math_functions.hh" -#include "context.h" +#include "deepgalois/math_functions.hh" +#include "deepgalois/context.h" #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" @@ -29,11 +29,11 @@ bool isnan_gpu(int n, const float_t *array) { } void gpu_rng_uniform(const int n, unsigned* r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); + CURAND_CHECK(curandGenerate(deepgalois::Context::curand_generator(), r, n)); } void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) scal_gpu(n, range, r); @@ -42,7 +42,7 @@ void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) } void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { - CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); + CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, n, mu, sigma)); } bool is_allocated_device(float_t* data) { @@ -171,7 +171,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, + CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } @@ -189,14 +189,14 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float* B, const float beta, float* C) { float *transpose_C; CUDA_CHECK(cudaMalloc((void**)&transpose_C, N * K * sizeof(float))); - CUSPARSE_CHECK(cusparseScsrmm2(Context::cusparse_handle(), + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - M, N, K, nnz, &alpha, Context::cusparse_matdescr(), A_nonzeros, + M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); //transpose C const float one = 1.0; const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, transpose_C, M, C, N)); } @@ -205,25 +205,25 @@ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, + CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(deepgalois::Context::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float* x, float* y) { - CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK(cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index e6d149a540..88d486f369 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -1,18 +1,18 @@ -#include "node.h" -#include "cutils.h" +#include "deepgalois/layers/node.h" +#include "deepgalois/cutils.h" -void edge::alloc_gpu() { +void deepgalois::edge::alloc_gpu() { CUDA_CHECK( cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); CUDA_CHECK( cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } -void edge::merge_grads_gpu(float_t* dst) { +void deepgalois::edge::merge_grads_gpu(float_t* dst) { CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); } -void edge::clear_grads_gpu() { +void deepgalois::edge::clear_grads_gpu() { CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } diff --git 
a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index bf279e4e37..7628c3aeba 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,6 +1,6 @@ -#include "optimizer.h" -#include "cutils.h" -#include "math_functions.hh" +#include "deepgalois/optimizer.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" __global__ void update_kernel(const int n, float_t alpha, float_t b1, float_t b2, float_t b1_t, float_t b2_t, @@ -14,7 +14,7 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } } -void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { +void deepgalois::adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); float_t* cache = get_gpu<0>(n, W); From c9001a727b93d2598898d79c0e233b7582b2cd27 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 11:47:36 -0600 Subject: [PATCH 094/660] aggregator moved to layers (only used htere) --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/{ => layers}/aggregator.h | 0 libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 5 +++-- libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/src/{ => layers}/aggregator.cpp | 2 +- libdeepgalois/src/{ => layers}/aggregator.cu | 0 6 files changed, 6 insertions(+), 5 deletions(-) rename libdeepgalois/include/deepgalois/{ => layers}/aggregator.h (100%) rename libdeepgalois/src/{ => layers}/aggregator.cpp (96%) rename libdeepgalois/src/{ => layers}/aggregator.cu (100%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 1ce41abc73..813992d433 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -46,8 +46,8 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp + src/layers/aggregator.cpp src/math_functions.cpp - src/aggregator.cpp src/optimizer.cpp src/context.cpp src/node.cpp diff --git a/libdeepgalois/include/deepgalois/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h similarity index 100% rename from libdeepgalois/include/deepgalois/aggregator.h rename to libdeepgalois/include/deepgalois/layers/aggregator.h diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index ed681bdf30..f5b7906f73 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,9 +1,10 @@ #pragma once #include "layer.h" -#include "deepgalois/aggregator.h" +#include "deepgalois/layers/aggregator.h" /** - * GraphConv Layer; based on DGL implementation + * GraphConv Layer; based on DGL implementation + follows TinyDNN layer + * convention * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html * * Parameters diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index f30ad03b7b..da6e866d6b 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -1,6 +1,6 @@ #pragma once /** - * Code from on below link. Modified under Galois. + * Code from on below link. Modified under Galois's license. 
* * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h * diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp similarity index 96% rename from libdeepgalois/src/aggregator.cpp rename to libdeepgalois/src/layers/aggregator.cpp index 360300dba3..5c7586e9a4 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -1,4 +1,4 @@ -#include "deepgalois/aggregator.h" +#include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu similarity index 100% rename from libdeepgalois/src/aggregator.cu rename to libdeepgalois/src/layers/aggregator.cu From 263ca21ba14138dc8c19d8d15b62d3fd2cfdb712 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:22:26 -0600 Subject: [PATCH 095/660] aggregator comments; vadd moved to deepgalois::math --- .../include/deepgalois/layers/aggregator.h | 2 ++ .../include/deepgalois/math_functions.hh | 12 +++++++ libdeepgalois/src/layers/aggregator.cpp | 31 +++++++++++++------ libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/math_functions.cpp | 8 +++++ 5 files changed, 45 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index bdc8c5aa5e..806f81a3e0 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -2,6 +2,8 @@ #include "deepgalois/types.h" #include "deepgalois/gtypes.h" namespace deepgalois { +//! For each node in the graph, add the embeddings of all of its neighbors +//! together (using norm_factor if specified) void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index b5c51203f8..101d5125be 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -20,8 +20,18 @@ extern "C" { const float negative_slope = 0; +namespace deepgalois { +namespace math { + +//! add two same size vectors into out void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add +//! add 2 arrays for n elements void vadd(size_t n, const float_t* a, const float_t* b, float_t* out); + +} // deepgalois +} // math + + void vsub(const vec_t& a, const vec_t& b, vec_t& out); void vmul(const vec_t& a, const vec_t& b, vec_t& out); void vdiv(const vec_t& a, const vec_t& b, vec_t& out); @@ -50,7 +60,9 @@ void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima +//! clear entire vector void clear(vec_t& in); +//! 
clear n elements of a vector void clear(size_t n, float_t* in); void relu(const vec_t& in, vec_t& out); // ReLU void relu(size_t n, const float_t* in, float_t* out); // ReLU diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 5c7586e9a4..398da276d2 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -3,26 +3,39 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - galois::do_all(galois::iterate(g.begin(), g.end()), - [&](const auto& src) { + galois::do_all(galois::iterate(g), + [&](const GNode src) { + // zero out this node's out values clear(len, &out[src * len]); - float_t a = 0.0, b = 0.0; - if (norm) - a = norm_factor[src]; + float_t a = 0.0; + float_t b = 0.0; + + // get normalization factor if needed + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings for (const auto e : g.edges(src)) { const auto dst = g.getEdgeDst(e); + if (norm) { + // normalize b as well b = a * norm_factor[dst]; vec_t neighbor(len); + // scale the neighbor's data using the normalization + // factor mul_scalar(len, b, &in[dst * len], &neighbor[0]); - vadd(len, &out[src * len], &neighbor[0], + // use scaled data to update + deepgalois::math::vadd(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] } else - vadd(len, &out[src * len], &in[dst * len], - &out[src * len]); // out[src] += in[dst] + // add embeddings from neighbors together + deepgalois::math::vadd(len, &out[src * len], + &in[dst * len], + &out[src * len]); // out[src] += in[dst] } }, - galois::chunk_size(), galois::steal(), + galois::chunk_size(), + galois::steal(), + galois::no_stats(), galois::loopname("update_all")); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b496f52d57..442478b220 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -21,7 +21,7 @@ void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& vec_t b(out.size(), 0); mvmul(Q, self, a); mvmul(W, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors + deepgalois::math::vadd(a, b, out); // out = W*self + Q*neighbors } graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 979f5ce9d7..5b5ee78031 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -7,6 +7,9 @@ extern "C" { //#include } +namespace deepgalois { +namespace math { + // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t& a, const vec_t& b, vec_t& out) { @@ -41,6 +44,11 @@ void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { } #endif +} // deepgalois +} // math + + + // vector subtract void vsub(const vec_t& in_a, const vec_t& in_b, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) From 8bbb5aa5673ba7ed87b823ec497b0590504cc8e9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:25:49 -0600 Subject: [PATCH 096/660] cmakelist change for previous commit aggregator change --- libdeepgalois/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 813992d433..7f481cb385 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -28,11 
+28,11 @@ else() link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES + src/layers/aggregator.cu src/math_functions.cu - src/aggregator.cu - src/optimizer.cu - src/context.cu - src/node.cu + src/optimizer.cu + src/context.cu + src/node.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) From 1f03f4dcdc786be43b31f03fc3ccd1ade0b1fafe Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:26:04 -0600 Subject: [PATCH 097/660] mulscalar and clear moved to deepgalois::math --- .../include/deepgalois/math_functions.hh | 14 ++++--- libdeepgalois/src/layers/aggregator.cpp | 4 +- libdeepgalois/src/math_functions.cpp | 41 ++++++++++--------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 101d5125be..8ac1eb653b 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,6 +27,14 @@ namespace math { void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add //! add 2 arrays for n elements void vadd(size_t n, const float_t* a, const float_t* b, float_t* out); +//! multiply vector by scalar +void mul_scalar(const float_t alpha, vec_t& Y); +//! multiply n elements of vector by scalar +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +//! clear entire vector +void clear(vec_t& in); +//! clear n elements of a vector +void clear(size_t n, float_t* in); } // deepgalois } // math @@ -37,8 +45,6 @@ void vmul(const vec_t& a, const vec_t& b, vec_t& out); void vdiv(const vec_t& a, const vec_t& b, vec_t& out); void add_scalar(const float_t alpha, vec_t& Y); void sub_scalar(const float_t alpha, vec_t& Y); -void mul_scalar(const float_t alpha, vec_t& Y); -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); void div_scalar(const float_t alpha, vec_t& Y); float_t dot(const vec_t& x, const vec_t& y); void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); @@ -60,10 +66,6 @@ void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima -//! clear entire vector -void clear(vec_t& in); -//! 
clear n elements of a vector -void clear(size_t n, float_t* in); void relu(const vec_t& in, vec_t& out); // ReLU void relu(size_t n, const float_t* in, float_t* out); // ReLU void d_relu(const vec_t& in_diff, const vec_t& data, diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 398da276d2..33d0033638 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,7 +6,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou galois::do_all(galois::iterate(g), [&](const GNode src) { // zero out this node's out values - clear(len, &out[src * len]); + deepgalois::math::clear(len, &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -23,7 +23,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou vec_t neighbor(len); // scale the neighbor's data using the normalization // factor - mul_scalar(len, b, &in[dst * len], &neighbor[0]); + deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); // use scaled data to update deepgalois::math::vadd(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 5b5ee78031..b0206dea90 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -44,6 +44,27 @@ void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { } #endif +void clear(vec_t& in) { + for (size_t i = 0; i < in.size(); i++) + in[i] = 0; +} + +void clear(size_t n, float_t* in) { + for (size_t i = 0; i < n; i++) + in[i] = 0; +} + +// vector multiply scalar +void mul_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] *= alpha; +} + +void mul_scalar(size_t n, const float_t alpha, const float_t* in, + float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = alpha * in[i]; +} } // deepgalois } // math @@ -81,17 +102,6 @@ void sub_scalar(const float_t alpha, vec_t& Y) { Y[i] -= alpha; } -// vector multiply scalar -void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] *= alpha; -} - -void mul_scalar(size_t n, const float_t alpha, const float_t* in, - float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = alpha * in[i]; -} // vector divide scalar void div_scalar(const float_t alpha, vec_t& Y) { @@ -299,15 +309,6 @@ int argmax(const size_t n, const float_t* x) { return max_ind; } -void clear(vec_t& in) { - for (size_t i = 0; i < in.size(); i++) - in[i] = 0; -} - -void clear(size_t n, float_t* in) { - for (size_t i = 0; i < n; i++) - in[i] = 0; -} void relu(const vec_t& in, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) { From 23fd8672ea036d302d5413b6cfa1ef04dca6b82c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:34:49 -0600 Subject: [PATCH 098/660] removed chunk size specification from aggregator (for now?) 
---
 libdeepgalois/src/layers/aggregator.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp
index 33d0033638..d32ab2c598 100644
--- a/libdeepgalois/src/layers/aggregator.cpp
+++ b/libdeepgalois/src/layers/aggregator.cpp
@@ -34,7 +34,6 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou
                                          &out[src * len]); // out[src] += in[dst]
                    }
                  },
-                 galois::chunk_size(),
                  galois::steal(),
                  galois::no_stats(),
                  galois::loopname("update_all"));

From a91e585431f46006d5b6f5e6d9489b9596917916 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Tue, 3 Mar 2020 12:35:06 -0600
Subject: [PATCH 099/660] qualified net_phase with deepgalois namespace

---
 libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 4 ++--
 libdeepgalois/include/deepgalois/layers/layer.h | 2 +-
 libdeepgalois/include/deepgalois/net.h | 2 +-
 libdeepgalois/src/layers/graph_conv_layer.cpp | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h
index f5b7906f73..92bf1587fc 100644
--- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h
+++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h
@@ -33,7 +33,7 @@ class graph_conv_layer : public layer {
   ~graph_conv_layer() {}
   void init();
   std::string layer_type() const override { return std::string("graph_conv"); }
-  void set_netphase(net_phase ctx) override { phase_ = ctx; }
+  void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; }
   // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data);
   // virtual void back_propagation(const vec_t &in_data, const vec_t &out_data,
   //                               vec_t &out_grad, vec_t &in_grad);
@@ -57,7 +57,7 @@ class graph_conv_layer : public layer {
   bool dropout_; // whether to use dropout at first
   const float_t dropout_rate_;
   float_t scale_;
-  net_phase phase_;
+  deepgalois::net_phase phase_;
   size_t x;
   size_t y;
   size_t z;
diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h
index da6e866d6b..941a1aa9b3 100644
--- a/libdeepgalois/include/deepgalois/layers/layer.h
+++ b/libdeepgalois/include/deepgalois/layers/layer.h
@@ -56,7 +56,7 @@ class layer : public deepgalois::node {
   }
   virtual ~layer() = default;
   virtual std::string layer_type() const = 0;
-  virtual void set_netphase(net_phase phase) {}
+  virtual void set_netphase(deepgalois::net_phase phase) {}
 //!
save context virtual void set_context(deepgalois::Context* ctx) { context = ctx; } virtual acc_t get_masked_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 0e18f39e1c..c7c574510e 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -34,7 +34,7 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->set_context(context); } - void set_netphases(net_phase phase) { + void set_netphases(deepgalois::net_phase phase) { for (size_t i = 0; i < num_layers; i++) layers[i]->set_netphase(phase); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 442478b220..8665674ead 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -68,7 +68,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { + if (dropout_ && phase_ == deepgalois::net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { dropout(y, scale_, dropout_rate_, &in_data[i * y], @@ -133,7 +133,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == net_phase::train) { + if (dropout_ && phase_ == deepgalois::net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); From 3c8bd7b771462749a202726cd9d511bdddd32043 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 13:11:00 -0600 Subject: [PATCH 100/660] comments/rearrange layer/net --- libdeepgalois/include/deepgalois/layers/layer.h | 9 +++++---- libdeepgalois/include/deepgalois/net.h | 11 ++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 941a1aa9b3..10a60c7f89 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -59,6 +59,7 @@ class layer : public deepgalois::node { virtual void set_netphase(deepgalois::net_phase phase) {} //! save context virtual void set_context(deepgalois::Context* ctx) { context = ctx; } + //! return layer loss virtual acc_t get_masked_loss() { return acc_t(0); } // main functions for layer work @@ -78,9 +79,9 @@ class layer : public deepgalois::node { mask_t* get_device_masks() { return d_masks_; } //! 
debug print function void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" - << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; + galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", + input_dims[0], ",", input_dims[1], "] output[", + output_dims[0], ",", output_dims[1], "]\n"); } virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { @@ -124,7 +125,7 @@ class layer : public deepgalois::node { next()->get_gradient(), prev()->get_gradient()); } - //! use optimizer to update weights given gradient + //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { // vec_t diff; // prev()->merge_grads(&diff); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index c7c574510e..67f7f10eae 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -34,15 +34,18 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->set_context(context); } + //! set netphases for all layers in this network void set_netphases(deepgalois::net_phase phase) { for (size_t i = 0; i < num_layers; i++) layers[i]->set_netphase(phase); } + //! print all layers void print_layers_info() { for (size_t i = 0; i < num_layers; i++) layers[i]->print_layer_info(); } + //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, float_t dropout_rate = 0.5) { @@ -58,6 +61,7 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } + //! Add an output layer to the network void append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); @@ -68,15 +72,16 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } - // forward propagation: [begin, end) is the range of samples used. + //! forward propagation: [begin, end) is the range of samples used. + //! calls "forward" on the layers of the network and returns the loss of the + //! 
final layer acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { // set mask for the last layer layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) - layers[i]->forward(); + for (size_t i = 0; i < num_layers; i++) layers[i]->forward(); return layers[num_layers - 1]->get_masked_loss(); } From b5848b4fa869e11ff9feef008a92e99db8d0650b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 13:11:23 -0600 Subject: [PATCH 101/660] matmul1d1d moved to deepgalois::math --- .../deepgalois/layers/graph_conv_layer.h | 3 - .../include/deepgalois/math_functions.hh | 7 ++- libdeepgalois/src/layers/graph_conv_layer.cpp | 12 ++-- libdeepgalois/src/math_functions.cpp | 60 ++++++++++--------- 4 files changed, 42 insertions(+), 40 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 92bf1587fc..3fe9ddc31d 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -34,9 +34,6 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); - // virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, - // vec_t &out_grad, vec_t &in_grad); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 8ac1eb653b..410078ce99 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -36,6 +36,10 @@ void clear(vec_t& in); //! 
clear n elements of a vector void clear(size_t n, float_t* in); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply + } // deepgalois } // math @@ -55,9 +59,6 @@ void copy2D1D(const tensor_t& in, vec_t& out); void copy1D1D(const vec_t& in, vec_t& out); void copy1D1D(size_t len, const float_t* in, float_t* out); void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, - float_t* C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C); void transpose2D(const tensor_t& in, tensor_t& out); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 8665674ead..f535a3812a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -74,12 +74,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ dropout(y, scale_, dropout_rate_, &in_data[i * y], &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z } else { - matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z } - aggregate(z, context->graph_cpu, out_temp, out_data); + graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); if (act_) { galois::do_all( @@ -107,7 +107,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y + deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same update_all(y, context->graph_cpu, in_temp, in_grad, true, @@ -124,7 +124,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // calculate weight gradients transpose(x, y, in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } #else @@ -137,7 +137,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); + graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index b0206dea90..255e6483ce 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -44,6 +44,18 @@ void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { } #endif +// vector multiply scalar +void mul_scalar(const float_t alpha, 
vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] *= alpha; +} + +void mul_scalar(size_t n, const float_t alpha, const float_t* in, + float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = alpha * in[i]; +} + void clear(vec_t& in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; @@ -54,17 +66,27 @@ void clear(size_t n, float_t* in) { in[i] = 0; } -// vector multiply scalar -void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] *= alpha; +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } -void mul_scalar(size_t n, const float_t alpha, const float_t* in, - float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = alpha * in[i]; +// num rows in A, C; num columns in B, C; num columns in A, rows in B +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); + Tmatmul.stop(); } + + } // deepgalois } // math @@ -174,14 +196,6 @@ void copy1D1D(size_t len, const float_t* in, float_t* out) { std::copy(in, in + len, out); } -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); -} void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { // A: x*z; B: z*y; C: x*y @@ -202,16 +216,6 @@ void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { } } -// num rows in A, C; num columns in B, C; num columns in A, rows in B -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, float_t* C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); - Tmatmul.stop(); -} void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C) { @@ -222,7 +226,7 @@ void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, assert(C.size() == dim_x * dim_y); vec_t A1D(dim_x * dim_z); copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); + deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); } void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { @@ -239,7 +243,7 @@ void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { std::copy(A[i].begin(), A[i].end(), ptr); ptr += dim_z; } - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); + deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); for (size_t i = 0; i < dim_x; i++) { for (size_t j = 0; j < dim_y; ++j) { C[i][j] = C1D[i * dim_y + j]; From e9e3153be737a7b56cc2d10a7a94acaac5b63784 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 13:18:41 -0600 Subject: [PATCH 102/660] dropout functions moved to deepgalios::math + commented --- .../include/deepgalois/math_functions.hh | 26 +++--- libdeepgalois/src/layers/graph_conv_layer.cpp | 4 +- libdeepgalois/src/math_functions.cpp | 90 ++++++++++--------- 3 files changed, 63 insertions(+), 57 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 410078ce99..09fd3d753a 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -36,6 +36,21 @@ void clear(vec_t& in); //! 
clear n elements of a vector void clear(size_t n, float_t* in); +// dropout functions apply a random scale to in vector +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, vec_t& out); // dropout +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, float_t* out); +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out); +// dropout calls that use existing scales in masks instead of generating them +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& mask, + vec_t& out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* mask, float_t* out_diff); + + void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply @@ -71,17 +86,6 @@ void relu(const vec_t& in, vec_t& out); // ReLU void relu(size_t n, const float_t* in, float_t* out); // ReLU void d_relu(const vec_t& in_diff, const vec_t& data, vec_t& out_diff); // ReLU derivative -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, vec_t& out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, float_t* out); -void dropout(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out); -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& mask, - vec_t& out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* mask, float_t* out_diff); void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f535a3812a..c071a1cd0d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,7 +71,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ if (dropout_ && phase_ == deepgalois::net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i * y], + deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); deepgalois::math::matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z @@ -115,7 +115,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(y, scale_, &in_grad[i * y], + deepgalois::math::d_dropout(y, scale_, &in_grad[i * y], &dropout_mask[i * y], &in_grad[i * y]); }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 255e6483ce..96e9552b56 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -10,6 +10,16 @@ extern "C" { namespace deepgalois { namespace math { +//! 
wrapper function to call cblas_sgemm +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); +} + // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t& a, const vec_t& b, vec_t& out) { @@ -66,13 +76,42 @@ void clear(size_t n, float_t* in) { in[i] = 0; } -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& masks, vec_t& out) { + assert(masks.size() == out.size()); + // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers + for (size_t i = 0; i < in.size(); ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * masks[i] * scale; +} + +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& masks, float_t* out) { + for (size_t i = 0; i < in.size(); ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * masks[i] * scale; +} + +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out) { + for (size_t i = 0; i < n; ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < n; ++i) + out[i] = in[i] * masks[i] * scale; +} + +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& masks, vec_t& out_diff) { + for (size_t i = 0; i < in_diff.size(); ++i) + out_diff[i] = in_diff[i] * masks[i] * scale; +} + +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* masks, float_t* out_diff) { + for (size_t i = 0; i < n; ++i) + out_diff[i] = in_diff[i] * masks[i] * scale; } // num rows in A, C; num columns in B, C; num columns in A, rows in B @@ -86,7 +125,6 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, Tmatmul.stop(); } - } // deepgalois } // math @@ -352,43 +390,7 @@ float reduce_mean(const vec_t& x) { return sum / (float)n; } -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, vec_t& out) { - assert(masks.size() == out.size()); - // rng_bernoulli(1. 
- dropout_rate, masks); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; -} - -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, float_t* out) { - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; -} - -void dropout(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - for (size_t i = 0; i < n; ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < n; ++i) - out[i] = in[i] * masks[i] * scale; -} - -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& masks, vec_t& out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; -} -void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* masks, float_t* out_diff) { - for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; -} float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } From 88f6c009b687a56ac578d4affc83c904b84669fc Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 16:41:23 -0600 Subject: [PATCH 103/660] relu/copy1d1d moved to deepgalois::math, some comments in graphconv --- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 26 ++++++---- libdeepgalois/include/deepgalois/net.h | 5 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 15 +++--- libdeepgalois/src/math_functions.cpp | 52 ++++++++++--------- 5 files changed, 58 insertions(+), 42 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 3fe9ddc31d..518dede084 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -58,7 +58,7 @@ class graph_conv_layer : public layer { size_t x; size_t y; size_t z; - float_t* out_temp; + float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 09fd3d753a..c9ecbe6dea 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -18,11 +18,12 @@ extern "C" { // TODO namespace -const float negative_slope = 0; namespace deepgalois { namespace math { +const float negative_slope = 0; + //! add two same size vectors into out void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add //! add 2 arrays for n elements @@ -36,20 +37,33 @@ void clear(vec_t& in); //! 
clear n elements of a vector void clear(size_t n, float_t* in); -// dropout functions apply a random scale to in vector +// dropout functions randomly remove weights void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, vec_t& out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, float_t* out); void dropout(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* mask, float_t* out); -// dropout calls that use existing scales in masks instead of generating them +// dropout calls that use existing dropouts in masks instead of generating them; +// derivative void d_dropout(const float scale, const vec_t& in_diff, std::vector& mask, vec_t& out_diff); // dropout derivative void d_dropout(size_t n, const float scale, const float_t* in_diff, unsigned* mask, float_t* out_diff); +//! relu = keep if positive +void relu(const vec_t& in, vec_t& out); +//! relu = keep if positive; first n units +void relu(size_t n, const float_t* in, float_t* out); +//! relu derivative; generally, 1 if x > 0, 0 otherwise +void d_relu(const vec_t& in_diff, const vec_t& data, + vec_t& out_diff); // ReLU derivative + +//! copy vector from in -> out +void copy1D1D(const vec_t& in, vec_t& out); +//! copy vector from in -> out; first len elements +void copy1D1D(size_t len, const float_t* in, float_t* out); void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, @@ -71,8 +85,6 @@ void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, tensor_t& C); void copy2D1D(const tensor_t& in, vec_t& out); -void copy1D1D(const vec_t& in, vec_t& out); -void copy1D1D(size_t len, const float_t* in, float_t* out); void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C); @@ -82,10 +94,6 @@ void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima -void relu(const vec_t& in, vec_t& out); // ReLU -void relu(size_t n, const float_t* in, float_t* out); // ReLU -void d_relu(const vec_t& in_diff, const vec_t& data, - vec_t& out_diff); // ReLU derivative void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 67f7f10eae..47a48dea78 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -81,7 +81,10 @@ class Net { // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) layers[i]->forward(); + for (size_t i = 0; i < num_layers; i++) { + layers[i]->forward(); + // TODO need to sync model between layers here + } return layers[num_layers - 1]->get_masked_loss(); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c071a1cd0d..9060850c02 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -4,14 +4,14 @@ 
namespace deepgalois { #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - update_all(len, g, in, out, true, context->norm_factor); + deepgalois::update_all(len, g, in, out, true, context->norm_factor); } #else void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); #else - update_all(len, g, in, out, true, context->d_norm_factor); + deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); #endif } #endif @@ -81,10 +81,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); + // run relu activation on output if specified if (act_) { galois::do_all( galois::iterate((size_t)0, x), - [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, + [&](const auto& i) { deepgalois::math::relu(z, &out_data[i * z], + &out_data[i * z]); }, galois::loopname("relu")); } } @@ -94,6 +96,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { if (act_) { + // note; assumption here is that out_grad contains 1s or 0s via relu? galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? @@ -101,7 +104,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, ? out_grad[i * z + j] : float_t(0); }, galois::loopname("d_relu")); } else { - copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying } if (level_ != 0) { // no need to calculate in_grad for the first layer @@ -110,8 +113,8 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, in_temp, in_grad, true, - context->norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, + context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 96e9552b56..42ce73b689 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -114,6 +114,33 @@ void d_dropout(size_t n, const float scale, const float_t* in_diff, out_diff[i] = in_diff[i] * masks[i] * scale; } +void relu(const vec_t& in, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) { + out[i] = std::max(in[i], (float_t)0) + + negative_slope * std::min(in[i], (float_t)0); + } +} + +void relu(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = std::max(in[i], float_t(0)); +} + +void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) { + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + + negative_slope * (fv[i] <= (float_t)0)); + } +} + +void copy1D1D(const vec_t& in, vec_t& out) { + std::copy(in.begin(), in.end(), &out[0]); +} + +void copy1D1D(size_t len, const float_t* in, float_t* out) { + std::copy(in, in + len, out); +} + // num rows 
in A, C; num columns in B, C; num columns in A, rows in B void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { @@ -226,13 +253,6 @@ void copy2D1D(const tensor_t& in, vec_t& out) { } } -void copy1D1D(const vec_t& in, vec_t& out) { - std::copy(in.begin(), in.end(), &out[0]); -} - -void copy1D1D(size_t len, const float_t* in, float_t* out) { - std::copy(in, in + len, out); -} void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { @@ -352,24 +372,6 @@ int argmax(const size_t n, const float_t* x) { } -void relu(const vec_t& in, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0) + - negative_slope * std::min(in[i], (float_t)0); - } -} - -void relu(size_t n, const float_t* in, float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = std::max(in[i], float_t(0)); -} - -void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + - negative_slope * (fv[i] <= (float_t)0)); - } -} void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff From 3d156aaaaf4cf0af996b852b24ae735afef9a149 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 17:21:47 -0600 Subject: [PATCH 104/660] transpose to math TODO make parallel or remove completely and use sgemm --- .../include/deepgalois/math_functions.hh | 7 ++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 8 +++-- libdeepgalois/src/math_functions.cpp | 33 ++++++++++--------- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index c9ecbe6dea..cc277093f3 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -69,6 +69,11 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply +//! transposes a matrix (vector) +void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); +//! 
transposes a matrix (malloc'd array) +void transpose(size_t x, size_t y, const float_t* in, float_t* out); + } // deepgalois } // math @@ -90,8 +95,6 @@ void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C); void transpose2D(const tensor_t& in, tensor_t& out); void transpose2D1D(const tensor_t& in, vec_t& out); -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); -void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima void softmax(const vec_t& input, vec_t& output); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 9060850c02..2ce46756c3 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -107,9 +107,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying } + // at this point, out_temp has the derivative of activation + + // this calculates feature gradients if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + // derivative of matmul needs transposed matrix + deepgalois::math::transpose(y, z, W, trans_W); deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same @@ -126,7 +130,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } // calculate weight gradients - transpose(x, y, in_data, trans_data); // y*x + deepgalois::math::transpose(x, y, in_data, trans_data); // y*x deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 42ce73b689..af4e62c90c 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -152,6 +152,24 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, Tmatmul.stop(); } +// TODO make parallel +void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } +} + +// TODO make parallel +void transpose(size_t x, size_t y, const float_t* in, float_t* out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } +} + } // deepgalois } // math @@ -331,21 +349,6 @@ void transpose2D1D(const tensor_t& in, vec_t& out) { } } -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j * y + i]; - } - } -} - -void transpose(size_t x, size_t y, const float_t* in, float_t* out) { - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j * y + i]; - } - } -} int argmax(const size_t n, const vec_t& x) { float_t max = x[0]; From 7f3a386da149ae14e40336505f750ff7d53ed38c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 19:55:19 -0600 Subject: [PATCH 105/660] comments for training flow; clean up; namespace scoping --- .../deepgalois/layers/graph_conv_layer.h | 
4 +++ .../include/deepgalois/layers/layer.h | 19 +++----------- .../include/deepgalois/layers/node.h | 3 +-- libdeepgalois/include/deepgalois/net.h | 3 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 25 +++++++++++-------- .../src/layers/softmax_loss_layer.cpp | 1 + libdeepgalois/src/net.cpp | 20 ++++++++++++--- 7 files changed, 42 insertions(+), 33 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 518dede084..0a43cf0095 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -34,7 +34,11 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } + //! Uses weights contained in this layer to update in_data (results from previous) + //! and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); + //! Uses gradients from layer after this one to update both own weight gradients + //! as well as gradients for the features (in_grad) virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 10a60c7f89..bb009cd57a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -134,7 +134,7 @@ class layer : public deepgalois::node { // parallelize only when target size is big enough to mitigate thread // spawning overhead. bool parallel = (W.size() >= 512); - opt->update(weight_grad, W, parallel); // W += grad + opt->update(layer::weight_grad, layer::W, parallel); // W += grad #else //std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad @@ -166,20 +166,9 @@ class layer : public deepgalois::node { }; -// head: layer i+1, tail: layer i -inline void connect(layer* head, layer* tail, size_t head_index = 0, - size_t tail_index = 0) { - // auto out_shape = head->out_shape()[head_index]; - // auto in_shape = tail->in_shape()[tail_index]; - // head->setup(false); - // if (in_shape.size() == 0) { - // tail->set_in_shape(out_shape); - // in_shape = out_shape; - //} - // if (out_shape.size() != in_shape.size()) - // connection_mismatch(*head, *tail); - // if (!head->next_[head_index]) - // throw nn_error("output edge must not be null"); +//! 
Connects tail to head's edge and sets that edge's target to tail +//inline void connect(layer* head, layer* tail) { +inline void connect(layer* head, layer* tail) { tail->prev_ = head->next_; tail->prev_->add_next_node(tail); } diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index fcb20513c0..9b43167656 100644 --- a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -34,8 +34,7 @@ class node : public std::enable_shared_from_this { protected: node() = delete; - friend void connect(layer* head, layer* tail, size_t head_index, - size_t tail_index); + friend void connect(layer* head, layer* tail); mutable edgeptr_t prev_; mutable edgeptr_t next_; }; diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 47a48dea78..dfc4f3d0d7 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -57,8 +57,7 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); - if (layer_id > 0) - connect(layers[layer_id - 1], layers[layer_id]); + if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } //! Add an output layer to the network diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 2ce46756c3..f40f9ad591 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -48,7 +48,7 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); + zero_init_matrix(y, z, layer::weight_grad); if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; @@ -56,7 +56,7 @@ void graph_conv_layer::init() { // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data = new float_t[y * x]; // y*x #else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); #endif t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; @@ -74,11 +74,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); - deepgalois::math::matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_temp, &layer::W[0], out_temp); // x*y; y*z; x*z } else { - deepgalois::math::matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_data, &layer::W[0], out_temp); // x*y; y*z; x*z } + // aggregate based on graph topology graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); // run relu activation on output if specified @@ -100,6 +101,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? + // check if original data greater than 0; if so keep grad out_temp[i * z + j] = out_data[i * z + j] > float_t(0) ? 
out_grad[i * z + j] : float_t(0); }, galois::loopname("d_relu")); @@ -107,13 +109,14 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying } - // at this point, out_temp has the derivative of activation + // at this point, out_temp has the derivative of data from last step to + // use for both updating gradients for features and gradients for weights - // this calculates feature gradients + // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); // derivative of matmul needs transposed matrix - deepgalois::math::transpose(y, z, W, trans_W); + deepgalois::math::transpose(y, z, layer::W, trans_W); deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same @@ -129,9 +132,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } } - // calculate weight gradients + // calculate weight gradients by using previous layer's transpose multiplied + // by gradients from last back prop step deepgalois::math::transpose(x, y, in_data, trans_data); // y*x - deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z + // updates THIS layer's weight gradients to update them + deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z } #else @@ -163,7 +168,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index f16ba58fbe..eda3de054d 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -36,6 +36,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 2221b3daad..7407e99d9f 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -49,8 +49,9 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); + Timer t_epoch; - // run epoches + // run epochs for (unsigned i = 0; i < num_epochs; i++) { std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; @@ -59,18 +60,29 @@ void Net::train(optimizer* opt, bool need_validate) { // training steps set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; + + // forward: after this phase, layer edges will contain intermediate features + // for use during backprop Tfw.start(); train_loss = - fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + 
Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward
     train_acc = masked_accuracy(train_begin, train_end, train_count,
                                 &train_mask[0]); // predict
     Tfw.stop();
+
+    // backward: use intermediate features + ground truth to update layers
+    // with feature gradients which are then used to calculate weight gradients
     Tbw.start();
-    bprop(); // back propogation
+    Net::bprop();
     Tbw.stop();
+
+    // gradient update: use gradients stored on each layer to update model for
+    // next epoch
     Tupdate.start();
-    update_weights(opt); // update parameters
+    Net::update_weights(opt); // update parameters
     Tupdate.stop();
+
+    // validation / testing
     set_netphases(net_phase::test);
     std::cout << " train_loss = " << std::setw(5) << train_loss
               << " train_acc = " << std::setw(5) << train_acc;

From 916e51e39bd5c3205b1ba580b6c5141c0bdb2b82 Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Wed, 4 Mar 2020 08:49:19 -0600
Subject: [PATCH 106/660] fix header

---
 libdeepgalois/src/layers/aggregator.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu
index cd89cd92b1..166ea4b4bb 100644
--- a/libdeepgalois/src/layers/aggregator.cu
+++ b/libdeepgalois/src/layers/aggregator.cu
@@ -2,7 +2,7 @@
 #include "ggcuda.h"
 #include "cub/cub.cuh"
 #include "deepgalois/cutils.h"
-#include "deepgalois/aggregator.h"
+#include "deepgalois/layers/aggregator.h"
 #include "deepgalois/math_functions.hh"
 // TODO: use warp

From 6f1a3e25697ae8ae373b86bf85934b2aa3c68b2a Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Wed, 4 Mar 2020 10:23:48 -0600
Subject: [PATCH 107/660] fix gpu

---
 libdeepgalois/CMakeLists.txt                  | 32 ++++---
 .../include/deepgalois/layers/aggregator.h    |  7 +-
 .../deepgalois/layers/graph_conv_layer.h      | 15 ++--
 .../include/deepgalois/layers/layer.h         |  7 +-
 .../include/deepgalois/math_functions.hh      |  3 +-
 libdeepgalois/src/layers/aggregator.cpp       | 60 ++++++-------
 libdeepgalois/src/layers/aggregator.cu        |  4 +-
 libdeepgalois/src/layers/graph_conv_layer.cpp | 89 +++++--------------
 libdeepgalois/src/layers/graph_conv_layer.cu  | 50 +++++++++++
 libdeepgalois/src/layers/layer.cpp            | 12 +++
 libdeepgalois/src/math_functions.cpp          |  4 +-
 11 files changed, 149 insertions(+), 134 deletions(-)
 create mode 100644 libdeepgalois/src/layers/graph_conv_layer.cu
 create mode 100644 libdeepgalois/src/layers/layer.cpp

diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt
index 7f481cb385..9f797fc655 100644
--- a/libdeepgalois/CMakeLists.txt
+++ b/libdeepgalois/CMakeLists.txt
@@ -1,11 +1,11 @@
 cmake_minimum_required(VERSION 2.8)
-SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include)
-SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib)
+SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include)
+SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib)
 set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers
 set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers
-SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include)
-SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/)
+SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include)
+SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/)
 include_directories(${OPENBLAS_INC})
 include_directories(${CMAKE_SOURCE_DIR}/libgalois/include)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
@@ -16,26 +16,33 @@ include_directories("${MGPU_ROOT}/src")
 link_directories(${OPENBLAS_LIB})
 link_directories(${CMAKE_SOURCE_DIR}/libgalois)
-set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() + #set( CMAKE_VERBOSE_MAKEFILE on ) find_package(CUDA REQUIRED) set(CUDA_SEPARABLE_COMPILATION ON) set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_HOST_COMPILER g++) - #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_60,code=sm_60) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_61,code=sm_61) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) + #set(CUDA_INCLUDE_DIRS /org/centers/cdgc/cuda/cuda-10.0/include ${CUDA_INCLUDE_DIRS}) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES - src/layers/aggregator.cu + src/layers/graph_conv_layer.cu + #src/layers/softmax_loss_layer.cu + src/layers/aggregator.cu src/math_functions.cu - src/optimizer.cu - src/context.cu - src/node.cu + src/optimizer.cu + src/context.cu + src/node.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) - target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) + target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) #cuda_compile(MF_O src/math_functions.cu) @@ -47,6 +54,7 @@ set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/layers/aggregator.cpp + src/layers/layer.cpp src/math_functions.cpp src/optimizer.cpp src/context.cpp @@ -63,7 +71,7 @@ endif() target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) -target_link_libraries(dg_cpu -lcudart -lcublas -lcurand) +target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 806f81a3e0..1d6a1acebb 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -1,13 +1,14 @@ #pragma once #include "deepgalois/types.h" -#include "deepgalois/gtypes.h" -namespace deepgalois { //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) +#ifdef CPU_ONLY +#include "deepgalois/gtypes.h" +namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } -#ifndef CPU_ONLY +#else #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 0a43cf0095..b2b80b69e8 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -23,15 +23,15 @@ namespace deepgalois { class graph_conv_layer : public layer { public: - graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, - float_t dropout_rate, std::vector in_dims, - std::vector out_dims); + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, + bool dropout, float_t dropout_rate, + std::vector in_dims, std::vector out_dims); graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, - out_dims) {} + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init(); + void init_gpu(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } //! Uses weights contained in this layer to update in_data (results from previous) @@ -45,11 +45,10 @@ class graph_conv_layer : public layer { #ifdef CPU_ONLY virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, - float_t* out); + virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); #endif // user-defined combine function - virtual void combine(const vec_t& self, const vec_t& neighbors, vec_t& out); + virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); private: bool act_; // whether to use activation function at the end diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index bb009cd57a..19bb176f90 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -25,7 +25,6 @@ #include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" -#include "deepgalois/gtypes.h" #include "deepgalois/context.h" #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" @@ -78,11 +77,7 @@ class layer : public deepgalois::node { mask_t* get_device_masks() { return d_masks_; } //! 
debug print function - void print_layer_info() { - galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", - input_dims[0], ",", input_dims[1], "] output[", - output_dims[0], ",", output_dims[1], "]\n"); - } + void print_layer_info(); virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { begin_ = sample_begin; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index cc277093f3..61eceda3f2 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -85,7 +85,8 @@ void add_scalar(const float_t alpha, vec_t& Y); void sub_scalar(const float_t alpha, vec_t& Y); void div_scalar(const float_t alpha, vec_t& Y); float_t dot(const vec_t& x, const vec_t& y); -void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); +//void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); +void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector); void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, tensor_t& C); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index d32ab2c598..581c5f564c 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -1,40 +1,38 @@ #include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" +#ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - galois::do_all(galois::iterate(g), - [&](const GNode src) { - // zero out this node's out values - deepgalois::math::clear(len, &out[src * len]); - float_t a = 0.0; - float_t b = 0.0; + galois::do_all(galois::iterate(g), [&](const GNode src) { + // zero out this node's out values + deepgalois::math::clear(len, &out[src * len]); + float_t a = 0.0; + float_t b = 0.0; - // get normalization factor if needed - if (norm) a = norm_factor[src]; + // get normalization factor if needed + if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); + // gather neighbors' embeddings + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); - if (norm) { - // normalize b as well - b = a * norm_factor[dst]; - vec_t neighbor(len); - // scale the neighbor's data using the normalization - // factor - deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); - // use scaled data to update - deepgalois::math::vadd(len, &out[src * len], &neighbor[0], - &out[src * len]); // out[src] += in[dst] - } else - // add embeddings from neighbors together - deepgalois::math::vadd(len, &out[src * len], - &in[dst * len], - &out[src * len]); // out[src] += in[dst] - } - }, - galois::steal(), - galois::no_stats(), - galois::loopname("update_all")); + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + vec_t neighbor(len); + // scale the neighbor's data using the normalization + // factor + deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + // use scaled data to update + deepgalois::math::vadd(len, &out[src * len], &neighbor[0], + &out[src * len]); // out[src] += in[dst] + } else + // add embeddings from neighbors together + deepgalois::math::vadd(len, &out[src * len], + &in[dst * len], + &out[src * 
len]); // out[src] += in[dst] + } + }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); } +#endif diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 166ea4b4bb..06eac8bb75 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -59,7 +59,7 @@ __global__ void update_all_warp(size_t n, size_t len, CSRGraph g, } } -void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void deepgalois::update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); @@ -68,7 +68,7 @@ void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, CudaTest("solving update_all kernel failed"); } -void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void deepgalois::update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f40f9ad591..8e7b9f4eb5 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -2,28 +2,6 @@ namespace deepgalois { -#ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - deepgalois::update_all(len, g, in, out, true, context->norm_factor); -} -#else -void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { - #ifdef USE_CUSPARSE - update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - #else - deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); - #endif -} -#endif - -void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(Q, self, a); - mvmul(W, neighbors, b); - deepgalois::math::vadd(a, b, out); // out = W*self + Q*neighbors -} - graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float_t dropout_rate, std::vector in_dims, @@ -36,16 +14,29 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, z = output_dims[1]; trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); +#ifdef CPU_ONLY init(); +#else + init_gpu(); +#endif assert(dropout_rate_ < 1.); scale_ = 1. / (1. - dropout_rate_); } -void graph_conv_layer::init() { - //std::cout << name_ << ": allocating memory for params and temp data... 
"; - Timer t_alloc; - t_alloc.Start(); #ifdef CPU_ONLY +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + deepgalois::update_all(len, g, in, out, true, context->norm_factor); +} + +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { + vec_t a(dim_y, 0); + vec_t b(dim_y, 0); + mvmul(dim_x, dim_y, Q, self, a); + mvmul(dim_x, dim_y, W, neighbors, b); + deepgalois::math::vadd(len, a, b, out); // out = W*self + Q*neighbors +} + +void graph_conv_layer::init() { rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); @@ -55,25 +46,18 @@ void graph_conv_layer::init() { out_temp = new float_t[x * z]; // same as pre_sup in original GCN code: // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data = new float_t[y * x]; // y*x -#else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); -#endif - t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } -#ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == deepgalois::net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, galois::loopname("dropout")); + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, galois::loopname("dropout")); deepgalois::math::matmul1D1D(x, z, y, in_temp, &layer::W[0], out_temp); // x*y; y*z; x*z } else { deepgalois::math::matmul1D1D(x, z, y, in_data, &layer::W[0], out_temp); // x*y; y*z; x*z @@ -139,37 +123,6 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z } -#else -// GPU forward: compute output features -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - assert(y <= 128); // currently only support feature length <= 128 - init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == deepgalois::net_phase::train) { - dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) relu_gpu(x * z, out_data, out_data); -} - -// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) -void graph_conv_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); - else copy_gpu(x * z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); -#ifdef USE_CUSPARSE - update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, 
context->d_norm_factor); -#else - update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); -#endif - if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); -} #endif } // namespace diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu new file mode 100644 index 0000000000..dc6e0e72db --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -0,0 +1,50 @@ +#include "deepgalois/layers/graph_conv_layer.h" + +namespace deepgalois { + +void graph_conv_layer::init_gpu() { + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); +} + +void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + #ifdef USE_CUSPARSE + deepgalois::update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + #else + deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); + #endif +} + +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { +} + +// GPU forward: compute output features +void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + //assert(y <= 128); // currently only support feature length <= 128 + init_const_gpu(x*z, 0.0, out_temp); + if (dropout_ && phase_ == deepgalois::net_phase::train) { + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); + graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x * z, out_data, out_data); +} + +// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); + else copy_gpu(x * z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); +#ifdef USE_CUSPARSE + update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#else + update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#endif + if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); +} + +} diff --git a/libdeepgalois/src/layers/layer.cpp b/libdeepgalois/src/layers/layer.cpp new file mode 100644 index 0000000000..6abb1ffb6a --- /dev/null +++ b/libdeepgalois/src/layers/layer.cpp @@ -0,0 +1,12 @@ +#include "deepgalois/layers/layer.h" +#include "galois/Galois.h" + +namespace deepgalois { + +void layer::print_layer_info() { + galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", + input_dims[0], ",", input_dims[1], "] output[", + output_dims[0], ",", output_dims[1], "]\n"); +} + +} diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index af4e62c90c..12a2907500 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -231,9 +231,7 @@ float_t dot(size_t n, const float_t* 
x, const float_t* y) { } // matrix-vector multiply -void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector) { - size_t m = out_vector.size(); - size_t n = in_vector.size(); +void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { for (size_t i = 0; i < m; ++i) { for (size_t j = 0; j < n; ++j) { out_vector[i] += matrix[i * n + j] * in_vector[j]; From b07c624f21671bd04ecdbffd4a93530f4f508d9a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 10:33:20 -0600 Subject: [PATCH 108/660] fix cpu --- libdeepgalois/CMakeLists.txt | 6 +++--- libdeepgalois/src/layers/graph_conv_layer.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 9f797fc655..34e094ce14 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8) -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 8e7b9f4eb5..d174b716ac 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -28,11 +28,11 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_ deepgalois::update_all(len, g, in, out, true, context->norm_factor); } -void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { - vec_t a(dim_y, 0); - vec_t b(dim_y, 0); - mvmul(dim_x, dim_y, Q, self, a); - mvmul(dim_x, dim_y, W, neighbors, b); +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { + float_t *a = new float_t[len]; + float_t *b = new float_t[len]; + mvmul(n, len, &Q[0], self, a); + mvmul(n, len, &W[0], neighbors, b); deepgalois::math::vadd(len, a, b, out); // out = W*self + Q*neighbors } From 6612216a0845bb39e86e3aa38265c7ece5e4aab3 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 13:55:19 -0600 Subject: [PATCH 109/660] use sgemm_cpu --- .../include/deepgalois/math_functions.hh | 70 +++++++++--------- libdeepgalois/src/layers/aggregator.cpp | 10 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 73 +++++-------------- libdeepgalois/src/math_functions.cpp | 67 +++++++++-------- 4 files changed, 90 insertions(+), 130 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 61eceda3f2..26639f6f55 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -21,63 +21,61 @@ extern "C" { namespace deepgalois { namespace 
math {
-
-const float negative_slope = 0;
-
-//! add two same size vectors into out
-void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add
 //! add 2 arrays for n elements
-void vadd(size_t n, const float_t* a, const float_t* b, float_t* out);
-//! multiply vector by scalar
-void mul_scalar(const float_t alpha, vec_t& Y);
+void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out);
 //! multiply n elements of vector by scalar
 void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out);
-//! clear entire vector
-void clear(vec_t& in);
 //! clear n elements of a vector
-void clear(size_t n, float_t* in);
+void clear_cpu(size_t n, float_t* in);
+// dropout functions randomly remove weights
+void dropout_cpu(size_t n, const float scale, const float dropout_rate,
+                 const float_t* in, unsigned* mask, float_t* out);
+// dropout derivative: use existing dropouts in masks instead of generating them
+void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff,
+                   unsigned* mask, float_t* out_diff);
+//! ReLU = keep if positive
+void relu_cpu(size_t n, const float_t* in, float_t* out);
+//! ReLU derivative; generally, 1 if data > 0, 0 otherwise
+void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out);
+//! copy vector from in -> out; first len elements
+void copy_cpu(size_t len, const float_t* in, float_t* out);
+// single-precision dense matrix multiply
+void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+               const int M, const int N, const int K, const float alpha,
+               const float* A, const float* B, const float beta, float* C);
+// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse
+void csrmm_cpu(const int M, const int N, const int K, const int nnz,
+               const float alpha, const float* A_nonzeros,
+               const int* A_idx_ptr, const int* A_nonzero_idx,
+               const float* B, const float beta, float* C);
+} // math
+} // deepgalois
+//! clear entire vector
+void clear(vec_t& in);
+//! multiply vector by scalar
+void mul_scalar(const float_t alpha, vec_t& Y);
+//! add two same size vectors into out
+void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add
 // dropout functions randomly remove weights
 void dropout(const float scale, const float dropout_rate, const vec_t& in,
              std::vector& mask, vec_t& out); // dropout
 void dropout(const float scale, const float dropout_rate, const vec_t& in,
              std::vector& mask, float_t* out);
-void dropout(size_t n, const float scale, const float dropout_rate,
-             const float_t* in, unsigned* mask, float_t* out);
-// dropout calls that use existing dropouts in masks instead of generating them;
-// derivative
 void d_dropout(const float scale, const vec_t& in_diff,
-               std::vector& mask,
-               vec_t& out_diff); // dropout derivative
-void d_dropout(size_t n, const float scale, const float_t* in_diff,
-               unsigned* mask, float_t* out_diff);
-
-//! relu = keep if positive
+               std::vector& mask, vec_t& out_diff);
+//! ReLU = keep if positive
 void relu(const vec_t& in, vec_t& out);
-//! relu = keep if positive; first n units
-void relu(size_t n, const float_t* in, float_t* out);
-//! relu derivative; generally, 1 if x > 0, 0 otherwise
-void d_relu(const vec_t& in_diff, const vec_t& data,
-            vec_t& out_diff); // ReLU derivative
-
 //! copy vector from in -> out
 void copy1D1D(const vec_t& in, vec_t& out);
-//! copy vector from in -> out; first len elements
-void copy1D1D(size_t len, const float_t* in, float_t* out);
-
+//!
matrix multiply void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply - //! transposes a matrix (vector) void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); //! transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); - -} // deepgalois -} // math - - void vsub(const vec_t& a, const vec_t& b, vec_t& out); void vmul(const vec_t& a, const vec_t& b, vec_t& out); void vdiv(const vec_t& a, const vec_t& b, vec_t& out); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 581c5f564c..3fffb86054 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,7 +6,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou bool norm, const float_t* norm_factor) { galois::do_all(galois::iterate(g), [&](const GNode src) { // zero out this node's out values - deepgalois::math::clear(len, &out[src * len]); + deepgalois::math::clear_cpu(len, &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -21,16 +21,14 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // normalize b as well b = a * norm_factor[dst]; vec_t neighbor(len); - // scale the neighbor's data using the normalization - // factor + // scale the neighbor's data using the normalization factor deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); // use scaled data to update - deepgalois::math::vadd(len, &out[src * len], &neighbor[0], + deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] } else // add embeddings from neighbors together - deepgalois::math::vadd(len, &out[src * len], - &in[dst * len], + deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); // out[src] += in[dst] } }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d174b716ac..01c313d97d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -33,18 +33,16 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t *b = new float_t[len]; mvmul(n, len, &Q[0], self, a); mvmul(n, len, &W[0], neighbors, b); - deepgalois::math::vadd(len, a, b, out); // out = W*self + Q*neighbors + deepgalois::math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } void graph_conv_layer::init() { rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); - if (dropout_) - dropout_mask = new unsigned[x * y]; + if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; - out_temp = new float_t[x * z]; // same as pre_sup in original GCN code: - // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x } @@ -54,75 +52,42 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == deepgalois::net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - deepgalois::math::dropout(y, scale_, dropout_rate_, 
&in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, galois::loopname("dropout")); - deepgalois::math::matmul1D1D(x, z, y, in_temp, &layer::W[0], out_temp); // x*y; y*z; x*z - } else { - deepgalois::math::matmul1D1D(x, z, y, in_data, &layer::W[0], out_temp); // x*y; y*z; x*z - } + deepgalois::math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + } else deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); // aggregate based on graph topology graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); // run relu activation on output if specified - if (act_) { - galois::do_all( - galois::iterate((size_t)0, x), - [&](const auto& i) { deepgalois::math::relu(z, &out_data[i * z], - &out_data[i * z]); }, - galois::loopname("relu")); - } + if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (act_) { - // note; assumption here is that out_grad contains 1s or 0s via relu? - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? - // check if original data greater than 0; if so keep grad - out_temp[i * z + j] = out_data[i * z + j] > float_t(0) - ? out_grad[i * z + j] : float_t(0); - }, galois::loopname("d_relu")); - } else { - deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying - } + // note; assumption here is that out_grad contains 1s or 0s via relu? 
+ if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_temp); + else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights - // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z * y); // derivative of matmul needs transposed matrix - deepgalois::math::transpose(y, z, layer::W, trans_W); - deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y - // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, - context->norm_factor); // x*x; x*y -> x*y - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - deepgalois::math::d_dropout(y, scale_, &in_grad[i * y], - &dropout_mask[i * y], &in_grad[i * y]); - }, galois::chunk_size(), galois::steal(), - galois::loopname("d_dropout")); - } + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } - // calculate weight gradients by using previous layer's transpose multiplied - // by gradients from last back prop step - deepgalois::math::transpose(x, y, in_data, trans_data); // y*x - // updates THIS layer's weight gradients to update them - deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z + // calculate weight gradients using input data + // multiplied by gradients from last back prop step + //deepgalois::math::transpose(x, y, in_data, trans_data); // x*y -> y*x + //deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } - #endif - } // namespace + diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 12a2907500..700a4ce688 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,5 +1,6 @@ #include "deepgalois/math_functions.hh" #include "galois/Timer.h" +#include "galois/Galois.h" #include extern "C" { @@ -14,10 +15,13 @@ namespace math { void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); + Tmatmul.stop(); } // vector add @@ -34,23 +38,20 @@ void vadd(const vec_t& a, const vec_t& b, vec_t& out) { out[i] = a[i] + b[i]; } -void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { size_t vec_len = 8; const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps( - &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) - out[i] = a[i] + b[i]; + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } #else void vadd(const vec_t& a, const vec_t& b, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; } -void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = a[i] + b[i]; +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { + for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; } #endif @@ -71,9 +72,9 @@ void clear(vec_t& in) { in[i] = 0; } -void clear(size_t n, float_t* in) { - for (size_t i = 0; i < n; i++) - in[i] = 0; +void clear_cpu(size_t n, float_t* in) { + for (size_t i = 0; i < n; i++) in[i] = 0; + // memset(in, 0, n*sizeof(float_t)); } void dropout(const float scale, const float dropout_rate, const vec_t& in, @@ -94,12 +95,12 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, out[i] = in[i] * masks[i] * scale; } -void dropout(size_t n, const float scale, const float dropout_rate, +void dropout_cpu(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { - for (size_t i = 0; i < n; ++i) + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < n; ++i) out[i] = in[i] * masks[i] * scale; + }, galois::loopname("dropout")); } void d_dropout(const float scale, const vec_t& in_diff, @@ -108,48 +109,46 @@ void d_dropout(const float scale, const vec_t& in_diff, out_diff[i] = in_diff[i] * masks[i] * scale; } -void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* masks, float_t* out_diff) { - for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout_cpu(size_t n, const float scale, const float_t* in, + unsigned* masks, float_t* out) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = in[i] * masks[i] * scale; + }, galois::loopname("d_dropout")); } void relu(const vec_t& in, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0) + - negative_slope * std::min(in[i], (float_t)0); + out[i] = std::max(in[i], (float_t)0); } } -void relu(size_t n, const float_t* in, float_t* out) { - for (size_t i = 0; i < n; ++i) +void relu_cpu(size_t n, const float_t* in, float_t* out) { + // TODO: vectorize + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); + }, galois::loopname("relu")); } -void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + - negative_slope * (fv[i] <= (float_t)0)); - } +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, 
float_t* out) { + // TODO: vectorize + // check if original data greater than 0; if so keep grad + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = data[i] > float_t(0) ? in[i] : float_t(0); + }, galois::loopname("d_relu")); } void copy1D1D(const vec_t& in, vec_t& out) { std::copy(in.begin(), in.end(), &out[0]); } -void copy1D1D(size_t len, const float_t* in, float_t* out) { +void copy_cpu(size_t len, const float_t* in, float_t* out) { std::copy(in, in + len, out); } // num rows in A, C; num columns in B, C; num columns in A, rows in B void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); - Tmatmul.stop(); + sgemm_cpu(CblasNoTrans, CblasNoTrans, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } // TODO make parallel From 032827f23b3138662ef730c5670ea8c35dd50261 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 14:53:24 -0600 Subject: [PATCH 110/660] add csrmm_cpu impl --- .../include/deepgalois/layers/aggregator.h | 4 +- libdeepgalois/src/layers/aggregator.cpp | 57 ++++++++++--------- libdeepgalois/src/layers/aggregator.cu | 2 +- libdeepgalois/src/math_functions.cpp | 21 +++++++ 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 1d6a1acebb..ffdd3935a8 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -7,13 +7,15 @@ namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +void update_all_csrmm(size_t len, Graph& g, const float_t* in, + float_t* out, bool norm, const float_t* norm_factor); } #else #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); -void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, +void update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } #endif diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 3fffb86054..94752742ed 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,33 +4,38 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + // zero out the output data + deepgalois::math::clear_cpu(g.size()*len, out); galois::do_all(galois::iterate(g), [&](const GNode src) { - // zero out this node's out values - deepgalois::math::clear_cpu(len, &out[src * len]); - float_t a = 0.0; - float_t b = 0.0; - - // get normalization factor if needed - if (norm) a = norm_factor[src]; - - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); + float_t a = 0.0; + float_t b = 0.0; + // get normalization factor if needed + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + vec_t neighbor(len); + // scale the 
neighbor's data using the normalization factor + deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + // use scaled data to update + deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], + &out[src * len]); // out[src] += in[dst] + } else + // add embeddings from neighbors together + deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], + &out[src * len]); // out[src] += in[dst] + } + }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); +} - if (norm) { - // normalize b as well - b = a * norm_factor[dst]; - vec_t neighbor(len); - // scale the neighbor's data using the normalization factor - deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); - // use scaled data to update - deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], - &out[src * len]); // out[src] += in[dst] - } else - // add embeddings from neighbors together - deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], - &out[src * len]); // out[src] += in[dst] - } - }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); +void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.size(); + deepgalois::math::clear_cpu(n*len, out); + //csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + // (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); } #endif diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 06eac8bb75..ee5fe56b4d 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -68,7 +68,7 @@ void deepgalois::update_all(size_t len, CSRGraph& g, const float_t* in, float_t* CudaTest("solving update_all kernel failed"); } -void deepgalois::update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void deepgalois::update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 700a4ce688..d3f7d0fca0 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -8,6 +8,12 @@ extern "C" { //#include } +#define NOT_IMPLEMENTED \ + do { \ + std::cout << "Not Implemented Yet";\ + exit(1); \ + } while(0); + namespace deepgalois { namespace math { @@ -24,6 +30,21 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.stop(); } +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float* C) { +#ifdef USE_MKL + const char *matdescra = "GXXCX";//6 bytes + const char transa = 'N'; + mkl_scsrmm (&transa, &M , &N, &K, &alpha , matdescra, + A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, + B, &N, &beta , C, &N); +#else + NOT_IMPLEMENTED; +#endif +} + // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t& a, const vec_t& b, vec_t& out) { From d832912238e873ead1c16238d07f76f2f2ae8497 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 15:25:43 -0600 Subject: [PATCH 111/660] fix agg --- libdeepgalois/src/layers/aggregator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/layers/aggregator.cpp 
b/libdeepgalois/src/layers/aggregator.cpp index 94752742ed..6d7c7f6cbe 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -5,8 +5,8 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { // zero out the output data - deepgalois::math::clear_cpu(g.size()*len, out); galois::do_all(galois::iterate(g), [&](const GNode src) { + deepgalois::math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; // get normalization factor if needed From 2f03a623639a8b0aac97a2baa57aae9e4270e055 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 18:55:43 -0600 Subject: [PATCH 112/660] modify norm_factor --- libdeepgalois/include/deepgalois/context.h | 4 -- .../deepgalois/layers/graph_conv_layer.h | 3 +- libdeepgalois/src/context.cpp | 16 ----- libdeepgalois/src/context.cu | 49 --------------- libdeepgalois/src/layers/graph_conv_layer.cpp | 20 ++++-- libdeepgalois/src/layers/graph_conv_layer.cu | 61 ++++++++++++++++--- libdeepgalois/src/net.cpp | 1 - 7 files changed, 70 insertions(+), 84 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 2f769dc917..c906661d76 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -44,8 +44,6 @@ class Context { void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } - void norm_factor_counting(); - void norm_factor_counting_gpu(); size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -54,8 +52,6 @@ class Context { label_t* d_labels; // labels on device vec_t h_feats; // input features: N x D float_t* d_feats; // input features on device - float_t* norm_factor; // normalization constant based on graph structure - float_t* d_norm_factor; // norm_factor on device #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index b2b80b69e8..d4935b85db 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -31,7 +31,7 @@ class graph_conv_layer : public layer { : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init(); - void init_gpu(); + void norm_factor_counting(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } //! 
Uses weights contained in this layer to update in_data (results from previous) @@ -65,6 +65,7 @@ class graph_conv_layer : public layer { float_t* in_temp; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y + float_t* norm_factor; // normalization constant based on graph structure, TODO: make it static // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 5e2ccf4c02..54ff169c37 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -91,22 +91,6 @@ void Context::add_selfloop(Graph &og, Graph &g) { float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif -void Context::norm_factor_counting() { -#ifdef CPU_ONLY - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), - graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -#else - norm_factor_counting_gpu(); -#endif -} - // labels contain the ground truth (e.g. vertex classes) for each example // (num_examples x 1). Note that labels is not one-hot encoded vector and it can // be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 29bec6f008..3ea78b0912 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -22,57 +22,8 @@ int64_t cluster_seedgen(void) { return seed; } -// computing normalization factor for each vertex -__global__ void norm_factor_counting_node(int n, CSRGraph graph, - float_t* norm_fac) { - CUDA_KERNEL_LOOP(i, n) { - float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) - norm_fac[i] = 0.0; - else - norm_fac[i] = 1.0 / temp; - } -} - -// TODO: make sure self-loop added for each vertex -// computing normalization factor for each edge -__global__ void norm_factor_counting_edge(int n, CSRGraph graph, - float_t* norm_fac) { - CUDA_KERNEL_LOOP(src, n) { - float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != 0.0); // should never be zero since self-loop added for each vertex - d_src = 1.0 / sqrt(d_src); - index_type start = graph.edge_begin(src); - index_type end = graph.edge_end(src); - for (index_type e = start; e != end; e++) { - index_type dst = graph.getEdgeDst(e); - float_t d_dst = float_t(graph.getOutDegree(dst)); - assert(d_dst != 0.0); - d_dst = 1.0 / sqrt(d_dst); - norm_fac[e] = d_src * d_dst; - } - } -} - namespace deepgalois { -void Context::norm_factor_counting_gpu() { - assert(graph_gpu.nnodes == n); - std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; -#ifdef USE_CUSPARSE - int nnz = graph_gpu.nedges; - CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, nnz * sizeof(float_t))); - init_const_kernel<<>>(nnz, 0.0, d_norm_factor); - norm_factor_counting_edge<<>>( - n, graph_gpu, d_norm_factor); -#else - CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); - norm_factor_counting_node<<>>( - n, graph_gpu, d_norm_factor); -#endif - CudaTest("solving norm_factor_counting kernel failed"); -} - cublasHandle_t Context::cublas_handle_ = 0; cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 
01c313d97d..96ddf2339d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -14,18 +14,15 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, z = output_dims[1]; trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); -#ifdef CPU_ONLY init(); -#else - init_gpu(); -#endif assert(dropout_rate_ < 1.); scale_ = 1. / (1. - dropout_rate_); + if (norm_) norm_factor_counting(); // pre-compute normalizing factor } #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - deepgalois::update_all(len, g, in, out, true, context->norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_factor); } void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { @@ -46,6 +43,17 @@ void graph_conv_layer::init() { trans_data = new float_t[y * x]; // y*x } +void graph_conv_layer::norm_factor_counting() { + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + auto degree = std::distance(context->graph_cpu.edge_begin(v), context->graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z @@ -78,7 +86,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // derivative of matmul needs transposed matrix deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index dc6e0e72db..210dd8e54d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -1,22 +1,70 @@ #include "deepgalois/layers/graph_conv_layer.h" +// computing normalization factor for each vertex +__global__ void norm_factor_counting_node(int n, CSRGraph graph, + float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; + } +} + +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + index_type start = graph.edge_begin(src); + index_type end = graph.edge_end(src); + for (index_type e = start; e != end; e++) { + index_type dst = graph.getEdgeDst(e); + float_t d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + namespace deepgalois { -void graph_conv_layer::init_gpu() { +void 
graph_conv_layer::init() { gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + deepgalois::update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); #else - deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_factor); #endif } void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { } +void graph_conv_layer::norm_factor_counting() { + std::cout << "debug\n"; + int n = x;//context->graph_gpu.nnodes; + std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; +#ifdef USE_CUSPARSE + int nnz = context->graph_gpu.nedges; + CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); + init_const_kernel<<>>(nnz, 0.0, norm_factor); + norm_factor_counting_edge<<>>( + n, context->graph_gpu, norm_factor); +#else + CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); + norm_factor_counting_node<<>>( + n, context->graph_gpu, norm_factor); +#endif + CudaTest("solving norm_factor_counting kernel failed"); + std::cout << "Done\n"; +} + // GPU forward: compute output features void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { //assert(y <= 128); // currently only support feature length <= 128 @@ -38,13 +86,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); #ifdef USE_CUSPARSE - update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); #else - update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + update_all(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); #endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } - -} +} // namespace diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7407e99d9f..031541e060 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -7,7 +7,6 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool // read graph, get num nodes num_samples = context->read_graph(dataset_str, selfloop); num_classes = context->read_labels(dataset_str); - context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; //std::cout << "Reading label masks ... 
"; From 5b887666eda70b827afb418d7c9a9eccc8173d8e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 19:41:51 -0600 Subject: [PATCH 113/660] fix bug --- libdeepgalois/include/deepgalois/context.h | 2 + .../deepgalois/layers/graph_conv_layer.h | 4 +- libdeepgalois/src/context.cpp | 17 +++--- libdeepgalois/src/context.cu | 50 ++++++++++++++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 14 +---- libdeepgalois/src/layers/graph_conv_layer.cu | 52 +------------------ libdeepgalois/src/net.cpp | 1 + 7 files changed, 69 insertions(+), 71 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index c906661d76..a622a0f0f7 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -44,6 +44,7 @@ class Context { void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + void norm_factor_counting(); size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -52,6 +53,7 @@ class Context { label_t* d_labels; // labels on device vec_t h_feats; // input features: N x D float_t* d_feats; // input features on device + float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index d4935b85db..b3f9d16d2b 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -31,9 +31,9 @@ class graph_conv_layer : public layer { : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init(); - void norm_factor_counting(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } + void set_context(deepgalois::Context* ctx) { context = ctx; norm_factor = ctx->norm_factor; } //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); @@ -65,7 +65,7 @@ class graph_conv_layer : public layer { float_t* in_temp; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y - float_t* norm_factor; // normalization constant based on graph structure, TODO: make it static + float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 54ff169c37..8b5917b70c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -7,18 +7,12 @@ Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) {} Context::~Context() {} -#endif size_t Context::read_graph(std::string dataset_str, bool selfloop) { -#ifdef CPU_ONLY n = read_graph_cpu(dataset_str, "gr", selfloop); -#else - n = read_graph_gpu(dataset_str, selfloop); -#endif return n; } -#ifdef CPU_ONLY size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -60,6 +54,17 @@ void Context::genGraph(LGraph& lg, Graph& g) { } } +void Context::norm_factor_counting() { + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + auto degree = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} + void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); g.constructNodes(); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 3ea78b0912..5ddbdc3dd8 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -22,6 +22,34 @@ int64_t cluster_seedgen(void) { return seed; } +// computing normalization factor for each vertex +__global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; + } +} + +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + index_type start = graph.edge_begin(src); + index_type end = graph.edge_end(src); + for (index_type e = start; e != end; e++) { + index_type dst = graph.getEdgeDst(e); + float_t d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + namespace deepgalois { cublasHandle_t Context::cublas_handle_ = 0; @@ -52,6 +80,28 @@ Context::~Context() { CURAND_CHECK(curandDestroyGenerator(curand_generator_)); } +size_t Context::read_graph(std::string dataset_str, bool selfloop) { + n = read_graph_gpu(dataset_str, selfloop); + return n; +} + +void Context::norm_factor_counting() { + std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; +#ifdef USE_CUSPARSE + int nnz = graph_gpu.nedges; + CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); + init_const_kernel<<>>(nnz, 0.0, norm_factor); + norm_factor_counting_edge<<>>( + n, graph_gpu, norm_factor); +#else + CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); + norm_factor_counting_node<<>>( + n, graph_gpu, norm_factor); +#endif + CudaTest("solving norm_factor_counting kernel failed"); + std::cout << "Done\n"; +} + void Context::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 96ddf2339d..189f396cf8 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -17,7 +17,6 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, init(); assert(dropout_rate_ < 1.); scale_ = 1. / (1. - dropout_rate_); - if (norm_) norm_factor_counting(); // pre-compute normalizing factor } #ifdef CPU_ONLY @@ -43,17 +42,6 @@ void graph_conv_layer::init() { trans_data = new float_t[y * x]; // y*x } -void graph_conv_layer::norm_factor_counting() { - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(context->graph_cpu.edge_begin(v), context->graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -} - // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z @@ -86,7 +74,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // derivative of matmul needs transposed matrix deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, norm_, norm_factor); // x*x; x*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 210dd8e54d..69630b50f9 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -1,34 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" -// computing normalization factor for each vertex -__global__ void norm_factor_counting_node(int n, CSRGraph graph, - float_t* norm_fac) { - CUDA_KERNEL_LOOP(i, n) { - float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_fac[i] = 0.0; - else norm_fac[i] = 1.0 / temp; - } -} - -// TODO: make sure self-loop added for each vertex -// computing normalization factor for each edge -__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { - CUDA_KERNEL_LOOP(src, n) { - float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != 0.0); // should never be zero since self-loop added for each vertex - d_src = 1.0 / sqrt(d_src); - index_type start = graph.edge_begin(src); - index_type end = graph.edge_end(src); - for (index_type e = start; e != end; e++) { - index_type dst = graph.getEdgeDst(e); - float_t d_dst = 
float_t(graph.getOutDegree(dst)); - assert(d_dst != 0.0); - d_dst = 1.0 / sqrt(d_dst); - norm_fac[e] = d_src * d_dst; - } - } -} - namespace deepgalois { void graph_conv_layer::init() { @@ -46,25 +17,6 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { } -void graph_conv_layer::norm_factor_counting() { - std::cout << "debug\n"; - int n = x;//context->graph_gpu.nnodes; - std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; -#ifdef USE_CUSPARSE - int nnz = context->graph_gpu.nedges; - CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); - init_const_kernel<<>>(nnz, 0.0, norm_factor); - norm_factor_counting_edge<<>>( - n, context->graph_gpu, norm_factor); -#else - CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); - norm_factor_counting_node<<>>( - n, context->graph_gpu, norm_factor); -#endif - CudaTest("solving norm_factor_counting kernel failed"); - std::cout << "Done\n"; -} - // GPU forward: compute output features void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { //assert(y <= 128); // currently only support feature length <= 128 @@ -86,9 +38,9 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); #ifdef USE_CUSPARSE - update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); + update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); #else - update_all(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); + update_all(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); #endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 031541e060..30f0e86488 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -111,6 +111,7 @@ void Net::construct_layers() { append_conv_layer(1); // hidden1 layer append_out_layer(2); // output layer layers[0]->set_in_data(context->get_in_ptr()); // feed input data + context->norm_factor_counting(); set_contexts(); } From cee94061ab951b7ec7996fbaeb07027708c1b877 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 5 Mar 2020 18:10:21 -0600 Subject: [PATCH 114/660] gluon gradients wrapper (WIP: need mirror setup) --- .../deepgalois/layers/GluonGradients.h | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/layers/GluonGradients.h diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h new file mode 100644 index 0000000000..4131fd22d4 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -0,0 +1,112 @@ +#ifndef __GLUON_GRADIENTS__ +#define __GLUON_GRADIENTS__ + +#include "deepgalois/types.h" + +/** + * Wraps the weight gradients and provides an interface for Gluon to + * synchronize them during distributed execution. + */ +class GluonGradients { +private: + //! Data type used for gradients + using GradientType = float_t; + //! type that's being used by the gradient vector + using GradientVecType = vec_t; + + GradientVecType& _gradients; + size_t _numWeights; + size_t _numOwned; + + //! 
my nodes whose's masters are on other hosts; global ids + std::vector> mirrorNodes; + // TODO save mirror ranges here as well + +public: + /** + * Save weight gradients + number of them (i.e. size). + * Then setup mirror metadata for Gluon to use during setup. + */ + GluonGradients(GradientVecType& gradients, size_t numWeights) + : _gradients(gradients), _numWeights(numWeights) { + } + + //! Size is number of weights + size_t size() const { + return _numWeights; + } + + //! Global size is number of weights + size_t globalSize() const { + return _numWeights; + } + + //! Return the weights owned by this host + size_t numMasters const { + return _numOwned; + } + + //! GID is same as LID since all hosts have all weights + uint32_t getGID(const uint32_t nodeID) const { + return nodeID; + } + + //! LID is same as GID since all hosts have all weights + uint32_t getLID(const uint32_t nodeID) const { + return nodeID; + } + + //! Return local weight w + GradientType getData(uint32_t w) { + return _gradients[w]; + } + + std::vector> getMirrorRanges() const { + // TODO + } + + //! Return mirror nodes for each host from this host's point of view + std::vector>& getMirrorNodes() { + return mirrorNodes; + } + + //! clears the vector + // TODO return to this when we start distributing on GPUs + void deallocate() { + _gradients.clear(); + } + + // Essentially no-op functions follow + + //! no nodes with edges + size_t getNumNodesWithEdges() { + return 0; + } + + //! No edges; not a vertex cut + bool is_vertex_cut() const { + return false; + } + + //! no edges, return 0 + unsigned edge_begin(uint32_t dummy) { + return 0; + } + + //! no edges, return 0 + unsigned edge_end(uint32_t dummy) { + return 0; + } + + //! no edges, return 0 + unsigned getEdgeDst(uint32_t dummy) { + return 0; + } + + //! 
no edges, return 0 + unsigned getEdgeData(uint32_t dummy) { + return 0; + } +}; + +#endif // end header guard From 798edcb907771557aa04ac059bc5b5d9091a1bf1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 5 Mar 2020 18:15:44 -0600 Subject: [PATCH 115/660] removing redundant includes --- libdeepgalois/include/deepgalois/layers/layer.h | 13 ------------- libdeepgalois/include/deepgalois/lgraph.h | 4 ---- 2 files changed, 17 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 19bb176f90..b2e06d5d61 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,19 +9,6 @@ * Reused/revised under 3-BSD */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 7a86960338..029d12d44b 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -4,10 +4,6 @@ // defines the Learning Graph (LGraph) data structure #include #include -#include -#include -#include -#include namespace deepgalois { From d04129b541e1fcab955832863e77bdd36fc8965b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 5 Mar 2020 19:33:05 -0600 Subject: [PATCH 116/660] fix matmul&aggregate order --- libdeepgalois/src/layers/graph_conv_layer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 189f396cf8..0457936b84 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -64,17 +64,17 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { // note; assumption here is that out_grad contains 1s or 0s via relu? 
- if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_temp); - else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_grad); + //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer - // derivative of matmul needs transposed matrix - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, norm_, norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + // derivative of matmul needs transposed matrix + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } @@ -82,7 +82,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // multiplied by gradients from last back prop step //deepgalois::math::transpose(x, y, in_data, trans_data); // x*y -> y*x //deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } #endif } // namespace From 063046ee79ab3ee1445c4158eb72f2237660a665 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 06:59:52 -0600 Subject: [PATCH 117/660] update cmake --- libdeepgalois/CMakeLists.txt | 6 ++---- lonestargnn/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 34e094ce14..da85c18185 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -17,7 +17,7 @@ link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") -if(USE_CPU) +if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() #set( CMAKE_VERBOSE_MAKEFILE on ) @@ -45,8 +45,6 @@ else() target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) - #cuda_compile(MF_O src/math_functions.cu) - #cuda_compile(AGG_O src/aggregator.cu) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") @@ -63,7 +61,7 @@ set(sources ) add_library(dg_cpu STATIC ${sources}) -if(USE_CPU) +if(NOT ENABLE_HETERO_GALOIS) target_link_libraries(dg_cpu galois_shmem gllvm) else() target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index e48887e261..3f6cb7672f 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -7,7 +7,7 @@ 
include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) include_directories(${CUDA_INC}) -if(NOT USE_CPU) +if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() @@ -15,7 +15,7 @@ SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) -if(USE_CPU) +if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") endif() From a2b5d625ddb2e88bb65133430074ff91ef269d5f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 07:25:54 -0600 Subject: [PATCH 118/660] fix cmake --- CMakeLists.txt | 19 ------------------- lonestargnn/gcn/CMakeLists.txt | 2 +- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79555a0b31..f1b0489c10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,27 +502,8 @@ if(USE_PANGOLIN) add_subdirectory(lonestarmine) endif(USE_PANGOLIN) if(USE_DEEPGALOIS) - SET(CUDA_SEPARABLE_COMPILATION ON) - find_package(CUDA REQUIRED) - set(CUDA_PROPAGATE_HOST_FLAGS off) - set(CUDA_SEPARABLE_COMPILATION on) - set(CUDA_HOST_COMPILER g++) - string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) - string(REPLACE "," ";" GENCODES ${GENCODES}) - foreach(GENCODE ${GENCODES}) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) - endforeach() - list(APPEND CUDA_NVCC_FLAGS "-std=c++11") - cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") add_subdirectory(libdeepgalois) add_subdirectory(lonestargnn) - set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers - cuda_include_directories("${CUB_ROOT}") - link_directories(${CMAKE_SOURCE_DIR}/cub) - set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers - cuda_include_directories("${MGPU_ROOT}/src") - link_directories(${CMAKE_SOURCE_DIR}/moderngpu/src) - add_subdirectory(libgpu) endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 3d25bb3966..c3fb95c07f 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,6 +1,6 @@ app(gcn gcn.cpp) target_link_libraries(gcn dg_cpu) -if(NOT USE_CPU) +if(ENABLE_HETERO_GALOIS) target_link_libraries(gcn dg_gpu) target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) endif() From 3d7e20e94b9a81a22c0369523bad7cc941b8b9f7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 10:37:52 -0600 Subject: [PATCH 119/660] add softmax_loss_layer.cu --- libdeepgalois/CMakeLists.txt | 2 +- .../include/deepgalois/math_functions.hh | 2 - .../src/layers/softmax_loss_layer.cpp | 25 +------- .../src/layers/softmax_loss_layer.cu | 62 +++++++++++++++++++ libdeepgalois/src/math_functions.cu | 29 +-------- libdeepgalois/src/net.cu | 45 ++++++++++++++ 6 files changed, 111 insertions(+), 54 deletions(-) create mode 100644 libdeepgalois/src/layers/softmax_loss_layer.cu create mode 100644 libdeepgalois/src/net.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index da85c18185..1d53f24bd5 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -34,7 +34,7 @@ else() link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES src/layers/graph_conv_layer.cu - #src/layers/softmax_loss_layer.cu + src/layers/softmax_loss_layer.cu 
src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 26639f6f55..06dd72c528 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -139,8 +139,6 @@ void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, - float_t* loss); acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index eda3de054d..4146dcd17f 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -2,19 +2,16 @@ namespace deepgalois { +#ifdef CPU_ONLY softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); -#ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample -#else - float_malloc_device(in_dims[0], loss); -#endif } -#ifdef CPU_ONLY + // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, @@ -69,24 +66,6 @@ acc_t softmax_loss_layer::get_masked_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } -#else // GPU implementation -void softmax_loss_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - init_const_gpu(input_dims[0], 0.0, loss); - softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->d_labels, loss, out_data); -} - -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->d_labels, out_data, in_grad); -} - -acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, d_masks_, loss); -} #endif } // namespace diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu new file mode 100644 index 0000000000..e9216b1ae2 --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -0,0 +1,62 @@ +#include "deepgalois/layers/softmax_loss_layer.h" +#include "gg.h" +#include "ggcuda.h" + +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, + float_t* loss, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + // total += loss[begin+i]; + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, + float_t* loss) { + assert(count > 0); + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + 
CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; +} + +namespace deepgalois { + +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + float_malloc_device(in_dims[0], loss); +} + +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, + d_masks_, context->d_labels, loss, out_data); +} + +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, + context->d_labels, out_data, in_grad); +} + +acc_t softmax_loss_layer::get_masked_loss() { + return masked_avg_loss(begin_, end_, count_, d_masks_, loss); +} + +} // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9131bf9509..e899f16226 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -357,33 +357,6 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, CudaTest("solving d_softmax_cross_entropy kernel failed"); } -__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, - float_t* loss, - HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) - // total += loss[begin+i]; - total.reduce(loss[begin + i]); - } - total.thread_exit>(local_loss); -} - -acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, - float_t* loss) { - assert(count > 0); - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>( - begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()) / count; -} - // the arguments of the maxima __device__ int argmax_device(const int n, const float_t* x) { float_t max = x[0]; @@ -425,7 +398,7 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); masked_accuracy_kernel<<>>( num_classes, begin, end, masks, preds, labels, accuracy_accum); - CudaTest("solving masked_avg_loss kernel failed"); + CudaTest("solving masked_accuracy kernel failed"); cudaDeviceSynchronize(); return *(total_accuracy.cpu_rd_ptr()) / count; } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu new file mode 100644 index 0000000000..28cd021df9 --- /dev/null +++ b/libdeepgalois/src/net.cu @@ -0,0 +1,45 @@ +#include "deepgalois/net.h" +#include "gg.h" +#include "ggcuda.h" + +__global__ void masked_accuracy_kernel(int num_classes, int begin, + int end, mask_t* masks, + float_t* preds, label_t* labels, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage + local_accuracy; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); + if (pred == labels[begin + i]) + total.reduce(1.0); + } + } + total.thread_exit>(local_accuracy); +} + +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, + 
int count, mask_t* masks, float_t* preds, + label_t* labels) { + assert(count > 0); + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>( + num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_accuracy kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; +} + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { + return masked_accuracy_gpu(num_classes, begin, end, count, + layers[NUM_CONV_LAYERS]->get_device_masks(), + layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + context->d_labels); +} + From 2393762b5f1ebcbe1e23968bda6731a636b2112b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 10:54:35 -0600 Subject: [PATCH 120/660] add net.cu --- libdeepgalois/CMakeLists.txt | 1 + .../include/deepgalois/math_functions.hh | 3 -- libdeepgalois/src/math_functions.cu | 45 ------------------- libdeepgalois/src/net.cpp | 10 +---- libdeepgalois/src/net.cu | 22 +++++++-- 5 files changed, 21 insertions(+), 60 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 1d53f24bd5..2cf03d281f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,6 +40,7 @@ else() src/optimizer.cu src/context.cu src/node.cu + src/net.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 06dd72c528..158455f73b 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -139,9 +139,6 @@ void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_accuracy_gpu(int num_classes, int begin, int end, - int count, mask_t* masks, float_t* preds, - label_t* labels); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& loss); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e899f16226..531480091d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -357,48 +357,3 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, CudaTest("solving d_softmax_cross_entropy kernel failed"); } -// the arguments of the maxima -__device__ int argmax_device(const int n, const float_t* x) { - float_t max = x[0]; - int max_ind = 0; - for (int i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - -__global__ void masked_accuracy_kernel(int num_classes, int begin, - int end, mask_t* masks, - float_t* preds, label_t* labels, - HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage - local_accuracy; - CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, - preds + (begin + i) * num_classes); - if (pred == labels[begin + i]) - total.reduce(1.0); - } - } - total.thread_exit>(local_accuracy); -} - -acc_t masked_accuracy_gpu(int num_classes, int begin, int end, - int count, mask_t* 
masks, float_t* preds, - label_t* labels) { - assert(count > 0); - HGAccumulator accuracy_accum; - Shared total_accuracy = Shared(1); - *(total_accuracy.cpu_wr_ptr()) = 0; - accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); - masked_accuracy_kernel<<>>( - num_classes, begin, end, masks, preds, labels, accuracy_accum); - CudaTest("solving masked_accuracy kernel failed"); - cudaDeviceSynchronize(); - return *(total_accuracy.cpu_rd_ptr()) / count; -} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 30f0e86488..a194bd43d7 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -115,9 +115,8 @@ void Net::construct_layers() { set_contexts(); } -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { #ifdef CPU_ONLY +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { AccumF accuracy_all; accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { @@ -130,12 +129,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); return accuracy_all.reduce() / (acc_t)count; -#else - return masked_accuracy_gpu(num_classes, begin, end, count, - layers[NUM_CONV_LAYERS]->get_device_masks(), - layers[NUM_CONV_LAYERS - 1]->next()->get_data(), - context->d_labels); -#endif } +#endif } // namespace deepgalois diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 28cd021df9..947967d07c 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -2,6 +2,19 @@ #include "gg.h" #include "ggcuda.h" +// the arguments of the maxima +__device__ int argmax_device(const int n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (int i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + __global__ void masked_accuracy_kernel(int num_classes, int begin, int end, mask_t* masks, float_t* preds, label_t* labels, @@ -20,9 +33,9 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, total.thread_exit>(local_accuracy); } -acc_t masked_accuracy_gpu(int num_classes, int begin, int end, - int count, mask_t* masks, float_t* preds, - label_t* labels) { +//acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { assert(count > 0); HGAccumulator accuracy_accum; Shared total_accuracy = Shared(1); @@ -35,6 +48,7 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, return *(total_accuracy.cpu_rd_ptr()) / count; } +namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { return masked_accuracy_gpu(num_classes, begin, end, count, @@ -42,4 +56,4 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->d_labels); } - +} From 92d1687d757494a36f47213328d0d64fbee0ee71 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 11:15:19 -0600 Subject: [PATCH 121/660] fix aggregate --- libdeepgalois/src/layers/graph_conv_layer.cpp | 9 ++++----- libdeepgalois/src/layers/graph_conv_layer.cu | 13 +++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 0457936b84..3800e6d2ad 100644 --- 
a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -67,12 +67,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + // x*y NOTE: since graph is symmetric, the derivative is the same + deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer - // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z // derivative of matmul needs transposed matrix deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); @@ -80,9 +81,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // calculate weight gradients using input data // multiplied by gradients from last back prop step - //deepgalois::math::transpose(x, y, in_data, trans_data); // x*y -> y*x - //deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } #endif } // namespace diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 69630b50f9..5717d37af8 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -33,17 +33,18 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); - else copy_gpu(x * z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); #ifdef USE_CUSPARSE - update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); + update_all_csrmm(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); #else - update_all(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); + update_all(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); #endif + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } + } // namespace + From fab9f9eeadddcfb9780b09f7ba3dcd98d0b284ba Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 13:44:33 -0600 Subject: [PATCH 122/660] linking dist libs with dg_cpu, reorg cmakelists --- libdeepgalois/CMakeLists.txt | 42 
+++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 2cf03d281f..ae00edabc0 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,26 +1,29 @@ cmake_minimum_required(VERSION 2.8) +# open blas SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) -set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers -set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) -SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) + +# galois base libs include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) -include_directories(${CUDA_INC}) -include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) -include_directories("${CUB_ROOT}") -include_directories("${MGPU_ROOT}/src") -link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() - #set( CMAKE_VERBOSE_MAKEFILE on ) + # hetero path + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + include_directories("${CUB_ROOT}") + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + include_directories("${MGPU_ROOT}/src") + + SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) + include_directories(${CUDA_INC}) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + find_package(CUDA REQUIRED) set(CUDA_SEPARABLE_COMPILATION ON) set(CUDA_PROPAGATE_HOST_FLAGS OFF) @@ -30,8 +33,10 @@ else() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) #set(CUDA_INCLUDE_DIRS /org/centers/cdgc/cuda/cuda-10.0/include ${CUDA_INCLUDE_DIRS}) + SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) + set(CUDA_SOURCES src/layers/graph_conv_layer.cu src/layers/softmax_loss_layer.cu @@ -70,7 +75,10 @@ endif() target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) -target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) + +if(ENABLE_HETERO_GALOIS) + target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) +endif() target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include @@ -78,6 +86,16 @@ target_include_directories(dg_cpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ) +# dist galois setup/linking to dg_cpu +if(ENABLE_DIST_GALOIS) + target_link_libraries(dg_cpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_cpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) +endif() + set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On From cf6b1541be59f2406c7701c2707e106ebb3e457f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 15:59:47 -0600 Subject: [PATCH 123/660] add avx2 for mul_scalar --- libdeepgalois/src/math_functions.cpp | 41 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git 
a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index d3f7d0fca0..cd21a6b1a0 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -45,48 +45,47 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #endif } +const size_t vec_len = 8; // vector add #if defined(__AVX__) || defined(__AVX2__) -void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - // for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; - size_t n = out.size(); - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps( - &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) - out[i] = a[i] + b[i]; -} - void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { - size_t vec_len = 8; const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } + +void vadd(const vec_t& a, const vec_t& b, vec_t& out) { + size_t n = out.size(); + vadd_cpu(n, &a[0], &b[0], &out[0]); +} #else void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) - out[i] = a[i] + b[i]; + for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; } void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; } #endif +#if defined(__AVX__) || defined(__AVX2__) +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { + const size_t alignedN = n - n % vec_len; + const __m256 scal = _mm256_set1_ps(alpha); + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); + for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; +} +#else // vector multiply scalar void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] *= alpha; + for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; } -void mul_scalar(size_t n, const float_t alpha, const float_t* in, - float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = alpha * in[i]; +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } +#endif void clear(vec_t& in) { for (size_t i = 0; i < in.size(); i++) From b963c932210790e5074a83d606362c1ac729a213 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 16:12:48 -0600 Subject: [PATCH 124/660] finishing up mirror setup for gluon gradients --- .../deepgalois/layers/GluonGradients.h | 95 +++++++++++++++++-- 1 file changed, 86 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index 4131fd22d4..578226552e 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -1,8 +1,12 @@ #ifndef __GLUON_GRADIENTS__ #define __GLUON_GRADIENTS__ +#include "galois/gstl.h" +#include "galois/runtime/Network.h" #include "deepgalois/types.h" +namespace deepgalois { + /** * Wraps the weight gradients and provides an interface for Gluon to * synchronize them during distributed execution. 
@@ -15,13 +19,25 @@ class GluonGradients { using GradientVecType = vec_t; GradientVecType& _gradients; + //! number of weight gradients size_t _numWeights; + //! number of gradients this host is responsible for size_t _numOwned; - //! my nodes whose's masters are on other hosts; global ids - std::vector> mirrorNodes; - // TODO save mirror ranges here as well + //! My host ID + unsigned _myHost; + //! Total num hosts in system + unsigned _totalHosts; + + //! first node I own + unsigned _beginMaster; + //! last node I own (contiguous chunk) + unsigned _endMaster; + //! my nodes whose's masters are on other hosts; global ids + std::vector> _mirrorNodes; + //! nodes that are mirrors on this host + std::vector> _mirrorRanges; public: /** * Save weight gradients + number of them (i.e. size). @@ -29,6 +45,53 @@ class GluonGradients { */ GluonGradients(GradientVecType& gradients, size_t numWeights) : _gradients(gradients), _numWeights(numWeights) { + _myHost = galois::runtime::getSystemNetworkInterface().ID; + _totalHosts = galois::runtime::getSystemNetworkInterface().Num; + + // allocate a vector for each host + _mirrorNodes.resize(_totalHosts); + + // loop through distribution of weights to hosts + for (unsigned h = 0; h < _totalHosts; h++) { + std::pair curRange = + galois::block_range((size_t)0, _numWeights, h, _totalHosts); + + if (h != _myHost) { + // setup mirrors for the host h which is just the list of IDs + size_t curW = curRange.first; + size_t lastW = curRange.second; + size_t numW = lastW - curW; + + // set mirrors for host h + _mirrorNodes[h].reserve(numW); + for (; curW < lastW; curW++) { + _mirrorNodes[h].push_back(curW); + } + } else { + // these belong to this host; save, then mirror ranges can be + // calculated from this + _beginMaster = curRange.first; + _endMaster = curRange.second; + _numOwned = _endMaster - _beginMaster; + + // first range is 0 to begin master + if (_beginMaster > 0) { + galois::gInfo("[", _myHost, "] Mirror range ", 0, " to ", + _beginMaster); + _mirrorRanges.emplace_back(0, _beginMaster); + } + + // second range is endMaster to end + if (_endMaster < _numWeights) { + galois::gInfo("[", _myHost, "] Mirror range ", _endMaster, " to ", + _numWeights); + _mirrorRanges.emplace_back(_endMaster, _numWeights); + } + } + } + + galois::gInfo("[", _myHost, "] This host owns ", _beginMaster, " to ", + _endMaster); } //! Size is number of weights @@ -42,10 +105,20 @@ class GluonGradients { } //! Return the weights owned by this host - size_t numMasters const { + size_t numMasters() const { return _numOwned; } + //! Return host ID + unsigned myHostID() const { + return _myHost; + } + + //! Return num hosts in the system + unsigned numHosts() const { + return _totalHosts; + } + //! GID is same as LID since all hosts have all weights uint32_t getGID(const uint32_t nodeID) const { return nodeID; @@ -57,21 +130,23 @@ class GluonGradients { } //! Return local weight w - GradientType getData(uint32_t w) { + GradientType getData(uint32_t w) const { return _gradients[w]; } - std::vector> getMirrorRanges() const { - // TODO + //! Return ranges for mirrors (unowned nodes) + const std::vector>& getMirrorRanges() const { + return _mirrorRanges; } //! Return mirror nodes for each host from this host's point of view std::vector>& getMirrorNodes() { - return mirrorNodes; + return _mirrorNodes; } //! 
clears the vector - // TODO return to this when we start distributing on GPUs + // TODO return to this when we start distributing on GPUs; wrapper + // end probably shouldn't be managing this MAYBE void deallocate() { _gradients.clear(); } @@ -109,4 +184,6 @@ class GluonGradients { } }; +} + #endif // end header guard From 6775ca2eefc82c151beb84535fa7ce9de4e847c6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 16:13:01 -0600 Subject: [PATCH 125/660] initialize gluon substrate in conv layer --- libdeepgalois/include/deepgalois/layers/layer.h | 12 +++++++++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 11 +++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b2e06d5d61..c394ac7bbf 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,12 +9,16 @@ * Reused/revised under 3-BSD */ -#include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/context.h" #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" +#include "deepgalois/layers/node.h" +#ifdef GALOIS_USE_DIST +#include "deepgalois/layers/GluonGradients.h" +#include "galois/graphs/GluonSubstrate.h" +#endif namespace deepgalois { @@ -145,6 +149,12 @@ class layer : public deepgalois::node { mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 deepgalois::Context* context; + +#ifdef GALOIS_USE_DIST + // Used for synchronization of weight gradients + deepgalois::GluonGradients* gradientGraph; + galois::graphs::GluonSubstrate* syncSub; +#endif }; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3800e6d2ad..48051e1ab7 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -36,6 +36,17 @@ void graph_conv_layer::init() { rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); + +#ifdef GALOIS_USE_DIST + // setup gluon + layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, + y * z); + layer::syncSub = + new galois::graphs::GluonSubstrate( + *layer::gradientGraph, layer::gradientGraph->myHostID(), + layer::gradientGraph->numHosts(), false); +#endif + if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; From 9d2b456f319cc5c71e8d9ed6ef4d1cf798f9d7a2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 16:49:26 -0600 Subject: [PATCH 126/660] LonestarGnn start uses DistSys as necessary --- lonestargnn/gcn/gcn.cpp | 5 +++++ lonestargnn/lonestargnn.h | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 3357fd904e..086b6701de 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -7,7 +7,12 @@ const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; int main(int argc, char** argv) { +#ifndef GALOIS_USE_DIST galois::SharedMemSys G; +#else + galois::DistMemSys G; +#endif + LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train // read network, features, ground truth, initialize metadata diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index a04905b5cb..e932738636 100644 --- 
a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -10,6 +10,10 @@ #include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" #include +#ifdef GALOIS_USE_DIST +#include "galois/DistGalois.h" +#include "galois/runtime/Network.h" +#endif namespace cll = llvm::cl; static cll::opt @@ -80,6 +84,11 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, llvm::cl::ParseCommandLineOptions(argc, argv); numThreads = galois::setActiveThreads(numThreads); galois::runtime::setStatFile(statFile); + +#ifdef GALOIS_USE_DIST + auto& net = galois::runtime::getSystemNetworkInterface(); + if (net.ID == 0) { +#endif LonestarGnnPrintVersion(); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; @@ -99,6 +108,10 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); galois::runtime::reportParam("(NULL)", "Threads", numThreads); +#ifdef GALOIS_USE_DIST + } +#endif + char name[256]; gethostname(name, 256); galois::runtime::reportParam("(NULL)", "Hostname", name); From 1816956da3eed26543b93462a9db6fecd5cf18a5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 17:37:54 -0600 Subject: [PATCH 127/660] dummy sync structures for weight gradients --- .../deepgalois/layers/GluonGradients.h | 2 +- .../deepgalois/layers/GradientSyncStructs.h | 51 +++++++++++++++++++ .../include/deepgalois/layers/layer.h | 3 +- 3 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index 578226552e..1643a62027 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -130,7 +130,7 @@ class GluonGradients { } //! Return local weight w - GradientType getData(uint32_t w) const { + GradientType& getData(uint32_t w) const { return _gradients[w]; } diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h new file mode 100644 index 0000000000..e38eb5192a --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -0,0 +1,51 @@ +#ifndef __GRAD_SYNC_STRUCT__ +#define __GRAD_SYNC_STRUCT__ + +#include "deepgalois/types.h" + +struct GradientSync { + using ValTy = float_t; + + static ValTy extract(uint32_t node_id, float_t& weight) { + return weight; + } + + static bool reduce(uint32_t node_id, float_t& weight, ValTy y) { + // TODO merge function here + // for now make sure the weights are close enough + if (std::abs(weight - y) > 0.00001) { + galois::gInfo("weight ", node_id, " not consistent with one received"); + } + + return true; + } + + //! reset weight to 0 + static void reset(uint32_t node_id, float_t &weight) { + weight = 0; + } + + //! 
save weight + static void setVal(uint32_t node_id, float_t &weight, ValTy y) { + weight = y; + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, + DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + +}; + +// TODO bitset; might have to do it manually +//GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); +#endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c394ac7bbf..a91f495915 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -16,8 +16,9 @@ #include "deepgalois/math_functions.hh" #include "deepgalois/layers/node.h" #ifdef GALOIS_USE_DIST -#include "deepgalois/layers/GluonGradients.h" #include "galois/graphs/GluonSubstrate.h" +#include "deepgalois/layers/GluonGradients.h" +#include "deepgalois/layers/GradientSyncStructs.h" #endif namespace deepgalois { From 67b3b1fc211e6c99ee29a9f2229def9bb5bcb3c9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 18:03:55 -0600 Subject: [PATCH 128/660] sync of weight gradients called + working --- .../include/deepgalois/layers/GradientSyncStructs.h | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 13 ++++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index e38eb5192a..df88352bcf 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -43,7 +43,6 @@ struct GradientSync { return false; } static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } - }; // TODO bitset; might have to do it manually diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 48051e1ab7..ceeae8605d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -86,13 +86,20 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y - if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, + out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + if (dropout_) { + deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, + in_grad); + } } // calculate weight gradients using input data // multiplied by gradients from last back prop step - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, + out_temp, 0.0, 
&layer::weight_grad[0]); // y*x; x*z; y*z + layer::syncSub->sync("GradientSync"); + //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); } #endif } // namespace From 65e65d75c0bdecab7f25909aff287930dee3c426 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 12:30:29 -0500 Subject: [PATCH 129/660] clean up CMake for deepgalois; fixed gpu issue --- libdeepgalois/CMakeLists.txt | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index ae00edabc0..bdc0f97942 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -66,20 +66,9 @@ set(sources src/net.cpp ) add_library(dg_cpu STATIC ${sources}) - -if(NOT ENABLE_HETERO_GALOIS) - target_link_libraries(dg_cpu galois_shmem gllvm) -else() - target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) -endif() - +target_link_libraries(dg_cpu galois_shmem gllvm) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) - -if(ENABLE_HETERO_GALOIS) - target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) -endif() - target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include @@ -94,6 +83,15 @@ if(ENABLE_DIST_GALOIS) ${CMAKE_SOURCE_DIR}/libcusp/include ${CMAKE_SOURCE_DIR}/libgluon/include ) + + if(ENABLE_HETERO_GALOIS) + target_link_libraries(dg_gpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_gpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) + endif() endif() set_target_properties(dg_cpu PROPERTIES From 02d8f2082bcdd1b0be0dec51ce4f2f6049ef12a1 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 9 Mar 2020 13:53:01 -0500 Subject: [PATCH 130/660] add cusparse flag --- libdeepgalois/include/deepgalois/types.h | 1 + libdeepgalois/src/context.cpp | 4 ++-- libdeepgalois/src/context.cu | 3 ++- libdeepgalois/src/layers/graph_conv_layer.cu | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 118f04bd04..1a32d5a47d 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -26,4 +26,5 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: #define BLOCK_SIZE 256 #define WARP_SIZE 32 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) +#define USE_CUSPARSE #endif diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8b5917b70c..dc959f5876 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -70,7 +70,7 @@ void Context::add_selfloop(Graph &og, Graph &g) { g.constructNodes(); /* for (size_t src = 0; src < og.size(); src++) { - g.getData(src) = 1; + //g.getData(src) = 1; auto begin = og.edge_begin(src); auto end = og.edge_end(src); g.fixEndEdge(src, end+src+1); @@ -90,7 +90,7 @@ void Context::add_selfloop(Graph &og, Graph &g) { } else g.constructEdge(e+src+1, dst, 0); } } - */ + //*/ } float_t* Context::get_in_ptr() { return &h_feats[0]; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 5ddbdc3dd8..564c0aaa08 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -3,6 +3,7 @@ #include #include #include "deepgalois/context.h" +#include "deepgalois/math_functions.hh" // random seeding int64_t cluster_seedgen(void) { @@ -90,7 +91,7 @@ 
void Context::norm_factor_counting() { #ifdef USE_CUSPARSE int nnz = graph_gpu.nedges; CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); - init_const_kernel<<>>(nnz, 0.0, norm_factor); + init_const_gpu(nnz, 0.0, norm_factor); norm_factor_counting_edge<<>>( n, graph_gpu, norm_factor); #else diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 5717d37af8..b2a9209bd4 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -8,7 +8,7 @@ void graph_conv_layer::init() { void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else deepgalois::update_all(len, g, in, out, norm_, norm_factor); #endif From 64bb52ded981880cb5e178cc20fc2f737317ba48 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 9 Mar 2020 15:30:20 -0500 Subject: [PATCH 131/660] fix bug in adding selfloop --- libdeepgalois/src/context.cpp | 4 ++++ libdeepgalois/src/context.cu | 3 +++ libgpu/include/graph_gpu.h | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index dc959f5876..79bd0be985 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -75,6 +75,10 @@ void Context::add_selfloop(Graph &og, Graph &g) { auto end = og.edge_end(src); g.fixEndEdge(src, end+src+1); bool self_inserted = false; + if (begin == end) { + new_edge_dst[begin+i] = i; + continue; + } for (auto e = begin; e != end; e++) { auto dst = og.getEdgeDst(e); if (!self_inserted) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 564c0aaa08..d727904107 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -36,6 +36,7 @@ __global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_f // computing normalization factor for each edge __global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { + assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); assert(d_src != 0.0); // should never be zero since self-loop added for each vertex d_src = 1.0 / sqrt(d_src); @@ -43,6 +44,8 @@ __global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_f index_type end = graph.edge_end(src); for (index_type e = start; e != end; e++) { index_type dst = graph.getEdgeDst(e); + if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); + assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); assert(d_dst != 0.0); d_dst = 1.0 / sqrt(d_dst); diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index da420ea416..3f2c88a308 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -123,6 +123,10 @@ struct CSRGraph { index_type start = row_start[i]; index_type end = row_start[i+1]; bool selfloop_inserted = false; + if (start == end) { + new_edge_dst[start+i] = i; + continue; + } for (index_type e = start; e != end; e++) { index_type dst = edge_dst[e]; if (!selfloop_inserted) { From ae75bd890398396f067ddbe995ca2ada02e2cf3c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 12:53:26 -0500 Subject: [PATCH 132/660] seed argument to rand_init_matrix --- libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index b3f9d16d2b..4904b13905 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -68,9 +68,9 @@ class graph_conv_layer : public layer { float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); - std::default_random_engine rng; + std::default_random_engine rng(seed); std::uniform_real_distribution dist(-init_range, init_range); matrix.resize(dim_x * dim_y); for (size_t i = 0; i < dim_x; ++i) { From 5129f80aa2e9f53bce2598f0247eacbe4724b7e9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 13:42:22 -0500 Subject: [PATCH 133/660] made it so dist execution explicitly uses same seed for weight matrix --- libdeepgalois/src/layers/graph_conv_layer.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index ceeae8605d..3a0cf8ad4e 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -33,10 +33,6 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const } void graph_conv_layer::init() { - rand_init_matrix(y, z, W); // randomly initialize trainable parameters - // rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, layer::weight_grad); - #ifdef GALOIS_USE_DIST // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, @@ -47,6 +43,16 @@ void graph_conv_layer::init() { layer::gradientGraph->numHosts(), false); #endif +#ifdef GALOIS_USE_DIST + // make sure seed consistent across all hosts for weight matrix + rand_init_matrix(y, z, W, 1); +#else + rand_init_matrix(y, z, W); +#endif + + // rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, layer::weight_grad); + if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; From 3526cf2022f6c44beb8ec081fab6c0c4cf558f3e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 14:31:45 -0500 Subject: [PATCH 134/660] softmax math functions to namespace; TODOs placed for sync --- .../include/deepgalois/math_functions.hh | 21 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 3 + .../src/layers/softmax_loss_layer.cpp | 35 ++- libdeepgalois/src/math_functions.cpp | 226 ++++++++---------- 4 files changed, 140 insertions(+), 145 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 158455f73b..8f73ed609e 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -25,6 +25,8 @@ namespace math { void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); //! multiply n elements of vector by scalar void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +//! do dot product of 2 vectors +float_t dot(const vec_t& x, const vec_t& y); //! 
clear n elements of a vector void clear_cpu(size_t n, float_t* in); // dropout functions randomly remove weights @@ -37,6 +39,15 @@ void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, void relu_cpu(size_t n, const float_t* in, float_t* out); //! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); +void softmax(const vec_t& input, vec_t& output); +void softmax(size_t n, const float_t* input, float_t* output); +void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); +float_t cross_entropy(const vec_t& y, const vec_t& p); +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); //! copy vector from in -> out; first len elements void copy_cpu(size_t len, const float_t* in, float_t* out); // single-precision dense matrix multiply @@ -82,7 +93,6 @@ void vdiv(const vec_t& a, const vec_t& b, vec_t& out); void add_scalar(const float_t alpha, vec_t& Y); void sub_scalar(const float_t alpha, vec_t& Y); void div_scalar(const float_t alpha, vec_t& Y); -float_t dot(const vec_t& x, const vec_t& y); //void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector); void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); @@ -96,15 +106,6 @@ void transpose2D(const tensor_t& in, tensor_t& out); void transpose2D1D(const tensor_t& in, vec_t& out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima -void softmax(const vec_t& input, vec_t& output); -void softmax(size_t n, const float_t* input, float_t* output); -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, - const float_t* dp); -float_t cross_entropy(const vec_t& y, const vec_t& p); -float_t cross_entropy(size_t n, const float_t* y, const float_t* p); -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); -void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3a0cf8ad4e..1c631a9d21 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,6 +71,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // aggregate based on graph topology graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); + // TODO sync required here // run relu activation on output if specified if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); @@ -85,7 +86,9 @@ void graph_conv_layer::back_propagation(const float_t* in_data, //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // x*y NOTE: since graph is symmetric, the derivative is the same + // this is the aggregate call deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + // TODO sync required here // at 
this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 4146dcd17f..9b64a0d353 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -20,33 +20,42 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked - softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); + // output is normalized input for this layer + math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax + // one hot encoded vector for the labels + std::vector groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[context->get_label(i)] = 1.0; // one-hot + // loss calculation + loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); + + // no sync required in distributed execution since no graph topology used + // in this forward pass; only a post-process pretty much } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) - size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), + size_t len = layer::input_dims[1]; + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i] == 1) { // masked vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len * i], &norm_grad[0]); - d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); + std::vector groundTruth(len, 0.0); + groundTruth[context->get_label(i)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); + + // no weight sync required: this is all local graph information } acc_t softmax_loss_layer::get_masked_loss() { @@ -55,7 +64,7 @@ acc_t softmax_loss_layer::get_masked_loss() { AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i]) { total_loss += loss[i]; diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index cd21a6b1a0..6b383e4b78 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -87,6 +87,21 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) } #endif +// dot product +float_t dot(const vec_t& x, const vec_t& y) { + float_t sum = 0; + for (size_t i = 0; i < x.size(); ++i) + sum += x[i] * y[i]; + return sum; +} + 
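// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch above] A minimal, self-contained
// C++ sketch of the softmax + cross-entropy math that PATCH 134 relocates into
// the deepgalois::math namespace, mirroring the per-vertex logic used by
// softmax_loss_layer.cpp (normalize with softmax, build a one-hot ground-truth
// vector, compute cross-entropy). All names here (softmax_demo,
// cross_entropy_demo, logits, onehot) are hypothetical and exist only for
// illustration; they are not part of libdeepgalois.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax: subtract the max element before exponentiating.
static void softmax_demo(const std::vector<float>& in, std::vector<float>& out) {
  const float m = *std::max_element(in.begin(), in.end());
  float denom  = 0.0f;
  out.resize(in.size());
  for (size_t i = 0; i < in.size(); i++) {
    out[i] = std::exp(in[i] - m);
    denom += out[i];
  }
  for (size_t i = 0; i < in.size(); i++)
    out[i] /= denom;
}

// Cross-entropy against a one-hot ground-truth vector, guarding against
// log(0) with a 1e-10 constant, matching the intent of the patched code.
static float cross_entropy_demo(const std::vector<float>& y,
                                const std::vector<float>& p) {
  float loss = 0.0f;
  for (size_t i = 0; i < y.size(); i++)
    if (y[i] != 0.0f)
      loss -= y[i] * std::log(std::max(p[i], 1e-10f));
  return loss;
}

int main() {
  std::vector<float> logits = {2.0f, 0.5f, -1.0f}; // per-class scores for one vertex
  std::vector<float> probs;
  softmax_demo(logits, probs);                     // normalize to a distribution
  std::vector<float> onehot = {1.0f, 0.0f, 0.0f};  // ground truth: class 0
  std::printf("loss = %f\n", cross_entropy_demo(onehot, probs));
  return 0;
}
// ---------------------------------------------------------------------------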
+float_t dot(size_t n, const float_t* x, const float_t* y) { + float_t sum = 0; + for (size_t i = 0; i < n; ++i) + sum += x[i] * y[i]; + return sum; +} + void clear(vec_t& in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; @@ -157,6 +172,95 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) }, galois::loopname("d_relu")); } +void softmax(const vec_t& input, vec_t& output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < input.size(); i++) + output[i] /= denominator; +} + +void softmax(size_t n, const float_t* input, float_t* output) { + const float_t max = *std::max_element(input, input + n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + +void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { + auto n = y.size(); + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + // float_t delta_ij = i == j? 1 : 0; + // df[i] += p[j] * (delta_ij - p[i]); + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + // dy = dp * (gradient of softmax) + dy[i] = dot(dp, df); + } +} + +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } +} + +// cross-entropy loss function for multi-class classification +// y: ground truth +// p: predicted probability +float_t cross_entropy(const vec_t& y, const vec_t& p) { + auto n = y.size(); + assert(n > 0); + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else loss -= y[i] * std::log(p[i]); + } + return loss; +} + +float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else + loss -= y[i] * std::log(p[i]); + } + return loss; +} + +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { + auto n = y.size(); + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} + +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} void copy1D1D(const vec_t& in, vec_t& out) { std::copy(in.begin(), in.end(), &out[0]); } @@ -188,7 +292,6 @@ void transpose(size_t x, size_t y, const float_t* in, float_t* out) { } } } - } // deepgalois } // math @@ -234,20 +337,6 @@ void div_scalar(const float_t alpha, vec_t& Y) { Y[i] /= alpha; } -// dot product -float_t dot(const vec_t& x, const vec_t& y) { - float_t sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; -} - -float_t dot(size_t n, const float_t* x, const float_t* y) { - float_t sum = 0; - for (size_t i = 0; i < n; ++i) - sum += x[i] * y[i]; - return sum; -} // matrix-vector multiply void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { @@ -424,112 
+513,5 @@ void sigmoid(vec_t& fv) { } } -// Softmax function takes an N-dimensional vector (X) of real number, -// and transforms it into a vector of real number in range (0,1) which add -// upto 1. To make softmax func numerically stable, we simply normalize the -// values in the vector, by multiplying the numerator and denominator with a -// constant C, where log(C)=-max(X) -// exps = np.exp(X - np.max(X)) -// exps / np.sum(exps) -void softmax(const vec_t& input, vec_t& output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - -void softmax(size_t n, const float_t* input, float_t* output) { - const float_t max = *std::max_element(input, input + n); - float_t denominator(0); - for (size_t i = 0; i < n; i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < n; i++) - output[i] /= denominator; -} - -void log_softmax(const vec_t& input, vec_t& output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) - denominator += std::exp(input[i] - max); - for (size_t i = 0; i < input.size(); i++) - output[i] = input[i] - max - denominator; -} - -// Due to the desirable property of softmax function outputting a probability -// distribution, we often use it as the final layer in neural networks. For this -// we need to calculate the derivative or gradient, and pass it back to the -// previous layer during backpropagation. -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - // float_t delta_ij = i == j? 1 : 0; - // df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -} - -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, - const float_t* dp) { - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - df[j] = (j == i) ? 
p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - dy[i] = dot(n, dp, &df[0]); - } -} -// cross-entropy loss function for multi-class classification -// y: ground truth -// p: predicted probability -float_t cross_entropy(const vec_t& y, const vec_t& p) { - auto n = y.size(); - assert(n > 0); - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) - continue; - if (p[i] == float_t(0)) - loss -= y[i] * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]); - } - return loss; -} - -float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) - continue; - if (p[i] == float_t(0)) - loss -= y[i] * std::log(float_t(1e-10)); - else - loss -= y[i] * std::log(p[i]); - } - return loss; -} -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { - auto n = y.size(); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } -} - -void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } -} From 5fdcc11eb9dc75ac41da511291f60b07c3ab3b4e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 17:36:15 -0500 Subject: [PATCH 135/660] removed unnecessary Caffe things from context; updated license note --- libdeepgalois/include/deepgalois/context.h | 21 +--------- libdeepgalois/licensenote.txt | 49 ++++++++++++++++++++++ libdeepgalois/src/context.cpp | 8 ++-- 3 files changed, 56 insertions(+), 22 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index a622a0f0f7..be0154e33f 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -1,6 +1,6 @@ #pragma once /** - * TODO if used from somewhere, get copyright/licences + * Based on common.hpp file of the Caffe deep learning library. 
*/ #include @@ -20,16 +20,7 @@ class Context { public: Context(); ~Context(); - enum Brew { CPU, GPU }; - Brew mode() { return mode_; } - void set_mode(Brew mode) { mode_ = mode; } - int solver_count() { return solver_count_; } - void set_solver_count(int val) { solver_count_ = val; } - int solver_rank() { return solver_rank_; } - void set_solver_rank(int val) { solver_rank_ = val; } - bool multiprocess() { return multiprocess_; } - void set_multiprocess(bool val) { multiprocess_ = val; } - bool root_solver() { return solver_rank_ == 0; } + size_t read_graph(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str); @@ -40,10 +31,6 @@ class Context { size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features - void SetDevice(const int device_id); - void DeviceQuery() {} - bool CheckDevice(const int device_id) { return true; } - int FindDevice(const int start_id = 0) { return 0; } void norm_factor_counting(); size_t n; // number of samples: N @@ -76,9 +63,5 @@ class Context { static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif - Brew mode_; - int solver_count_; - int solver_rank_; - bool multiprocess_; }; } // end deepgalois namespace diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt index 224adbc701..cf1aeb6caf 100644 --- a/libdeepgalois/licensenote.txt +++ b/libdeepgalois/licensenote.txt @@ -8,3 +8,52 @@ https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn under BSD-3 DGL structure as well from what I can tell + +================================================================================ +Caffe License +================================================================================ + +COPYRIGHT + +All contributions by the University of California: +Copyright (c) 2014-2017 The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014-2017, the respective contributors +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over +their contributions to Caffe. The project versioning records all such +contribution and copyright details. If a contributor wants to further mark +their specific copyright on a particular contribution, they should indicate +their copyright solely in the commit message of the change when it is +committed. + +LICENSE + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CONTRIBUTION AGREEMENT + +By contributing to the BVLC/caffe repository through pull-request, comment, +or otherwise, the contributor releases their content to the +license and copyright terms herein. diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 79bd0be985..2717567d28 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,11 +1,13 @@ +/** + * Based on common.hpp file of the Caffe deep learning library. + */ + #include "deepgalois/context.h" namespace deepgalois { #ifdef CPU_ONLY -Context::Context() - : mode_(Context::CPU), solver_count_(1), solver_rank_(0), - multiprocess_(false) {} +Context::Context() {} Context::~Context() {} size_t Context::read_graph(std::string dataset_str, bool selfloop) { From 88097af6cbdf77bb60beded033d4924d42f6f594 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:08:18 -0500 Subject: [PATCH 136/660] lonestargnn.h moved to include directry --- lonestargnn/CMakeLists.txt | 2 +- lonestargnn/{ => include}/lonestargnn.h | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename lonestargnn/{ => include}/lonestargnn.h (100%) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 3f6cb7672f..90711e2212 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(BEFORE ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include ) -include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) +include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/include/lonestargnn.h similarity index 100% rename from lonestargnn/lonestargnn.h rename to lonestargnn/include/lonestargnn.h From 7fb67db34d01aa1fe720b37acdeea7b858fa283e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:09:41 -0500 Subject: [PATCH 137/660] net: caffe header notif --- libdeepgalois/include/deepgalois/net.h | 5 +++-- libdeepgalois/src/net.cpp | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index dfc4f3d0d7..0daf730c42 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -1,5 +1,6 @@ -// TODO if this code was based on something, get copyright/license and put here - +/** + * Based on the net.hpp file from Caffe deep learning framework. + */ #ifndef _MODEL_H_ #define _MODEL_H_ diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a194bd43d7..d594e789e8 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,3 +1,7 @@ +/** + * Based on the net.hpp file from Caffe deep learning framework. 
+ */ + #include "deepgalois/net.h" namespace deepgalois { From 87297fc88ab58063018bf0144fd4a67d452326eb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:27:45 -0500 Subject: [PATCH 138/660] copy distgraphload to lonestargnn TODO need to remove some args and merge with current gnn cmdline args --- lonestargnn/CMakeLists.txt | 9 + lonestargnn/gcn/CMakeLists.txt | 3 + lonestargnn/gcn/gcn.cpp | 8 + lonestargnn/include/DistributedGraphLoader.h | 441 +++++++++++++++++++ lonestargnn/src/DistributedGraphLoader.cpp | 87 ++++ 5 files changed, 548 insertions(+) create mode 100644 lonestargnn/include/DistributedGraphLoader.h create mode 100644 lonestargnn/src/DistributedGraphLoader.cpp diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 90711e2212..a06dd1907b 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -2,6 +2,7 @@ include_directories(BEFORE ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include ) + include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) @@ -19,4 +20,12 @@ if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") endif() +if(ENABLE_DIST_GALOIS) + add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) + target_include_directories(distgraphloader PUBLIC + include + ) + target_link_libraries(distgraphloader galois_cusp) +endif() + add_subdirectory(gcn) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index c3fb95c07f..48c7156dcc 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,5 +1,8 @@ app(gcn gcn.cpp) target_link_libraries(gcn dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(gcn distgraphloader) +endif() if(ENABLE_HETERO_GALOIS) target_link_libraries(gcn dg_gpu) target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 086b6701de..f2e3f3f1eb 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -1,6 +1,9 @@ // Graph Neural Networks // Xuhao Chen #include "lonestargnn.h" +#ifdef GALOIS_USE_DIST +#include "DistributedGraphLoader.h" +#endif const char* name = "Graph Convolutional Networks"; const char* desc = "Graph convolutional neural networks on an undirected graph"; @@ -21,6 +24,11 @@ int main(int argc, char** argv) { // the user network.print_layers_info(); +#ifdef GALOIS_USE_DIST + std::vector dummy; + galois::graphs::constructSymmetricGraph(dummy); +#endif + // tracks peak memory usage deepgalois::ResourceManager rm; diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h new file mode 100644 index 0000000000..43d27d9669 --- /dev/null +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -0,0 +1,441 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting parallelism. + * The code is being released under the terms of the 3-Clause BSD License (a + * copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. 
NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +/** + * @file DistributedGraphLoader.h + * + * Contains definitions for the common distributed graph loading functionality + * of Galois. + * + * @todo Refactoring a bunch of this code is likely very possible to do + */ +#ifndef D_GRAPH_LOADER +#define D_GRAPH_LOADER + +#include "galois/graphs/CuSPPartitioner.h" + +/******************************************************************************* + * Supported partitioning schemes + ******************************************************************************/ +namespace galois { +namespace graphs { + +//! enums of partitioning schemes supported +enum PARTITIONING_SCHEME { + OEC, //!< outgoing edge cut + IEC, //!< incoming edge cut + HOVC, //!< outgoing hybrid vertex cut + HIVC, //!< incoming hybrid vertex cut + CART_VCUT, //!< cartesian vertex cut + CART_VCUT_IEC, //!< cartesian vertex cut using iec + //CEC, //!< custom edge cut + GINGER_O, //!< Ginger, outgoing + GINGER_I, //!< Ginger, incoming + FENNEL_O, //!< Fennel, oec + FENNEL_I, //!< Fennel, iec + SUGAR_O //!< Sugar, oec +}; + +/** + * Turns a PARTITIONING_SCHEME enum to a string + * + * @param e partitioning scheme enum + * @return string version of e + */ +inline const char* EnumToString(PARTITIONING_SCHEME e) { + switch (e) { + case OEC: + return "oec"; + case IEC: + return "iec"; + case HOVC: + return "hovc"; + case HIVC: + return "hivc"; + case CART_VCUT: + return "cvc"; + case CART_VCUT_IEC: + return "cvc_iec"; + //case CEC: + // return "cec"; + case GINGER_O: + return "ginger-oec"; + case GINGER_I: + return "ginger-iec"; + case FENNEL_O: + return "fennel-oec"; + case FENNEL_I: + return "fennel-iec"; + case SUGAR_O: + return "sugar-oec"; + default: + GALOIS_DIE("Unsupported partition"); + } +} +} // end namespace graphs +} // end namespace galois + +/******************************************************************************* + * Graph-loading-related command line arguments + ******************************************************************************/ +namespace cll = llvm::cl; + +//! input graph file +extern cll::opt inputFile; +//! input graph file, but transposed +extern cll::opt inputFileTranspose; +//! symmetric input graph file +extern cll::opt inputFileSymmetric; +//! partitioning scheme to use +extern cll::opt partitionScheme; +////! path to vertex id map for custom edge cut +//extern cll::opt vertexIDMapFileName; +//! true if you want to read graph structure from a file +extern cll::opt readFromFile; +//! path to local graph structure to read +extern cll::opt localGraphFileName; +//! if true, the local graph structure will be saved to disk after partitioning +extern cll::opt saveLocalGraph; +//! 
file specifying blocking of masters +extern cll::opt mastersFile; + +// @todo command line argument for read balancing across hosts + +namespace galois { +namespace graphs { + +/******************************************************************************* + * Graph-loading functions + ******************************************************************************/ + +/** + * Loads a symmetric graph file (i.e. directed graph with edges in both + * directions) + * + * @tparam NodeData node data to store in graph + * @tparam EdgeData edge data to store in graph + * @param scaleFactor How to split nodes among hosts + * @returns a pointer to a newly allocated DistGraph based on the command line + * loaded based on command line arguments + */ +template +DistGraph* +constructSymmetricGraph(std::vector& scaleFactor) { + if (!inputFileSymmetric) { + GALOIS_DIE("Calling constructSymmetricGraph without inputFileSymmetric " + "flag"); + } + + switch (partitionScheme) { + case OEC: + case IEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose, + mastersFile + ); + case HOVC: + case HIVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + case CART_VCUT: + case CART_VCUT_IEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + //case CEC: + // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, + // scaleFactor, vertexIDMapFileName, false); + + case GINGER_O: + case GINGER_I: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + case FENNEL_O: + case FENNEL_I: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + case SUGAR_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + default: + GALOIS_DIE("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +/** + * Loads a graph file with the purpose of iterating over the out edges + * of the graph. 
+ * + * @tparam NodeData node data to store in graph + * @tparam EdgeData edge data to store in graph + * @tparam iterateOut says if you want to iterate over out edges or not; if + * false, will iterate over in edgse + * @tparam enable_if this function will only be enabled if iterateOut is true + * @param scaleFactor How to split nodes among hosts + * @returns a pointer to a newly allocated DistGraph based on the command line + * loaded based on command line arguments + */ +template ::type* = nullptr> +DistGraph* +constructGraph(std::vector& scaleFactor) { + // 1 host = no concept of cut; just load from edgeCut, no transpose + auto& net = galois::runtime::getSystemNetworkInterface(); + if (net.Num == 1) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + } + + switch (partitionScheme) { + case OEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose, + mastersFile + ); + case IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose, + mastersFile + ); + } else { + GALOIS_DIE("Error: attempting incoming edge cut without transpose " + "graph"); + break; + } + + case HOVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + case HIVC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting incoming hybrid cut without transpose " + "graph"); + break; + } + + case CART_VCUT: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + + case CART_VCUT_IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting cvc incoming cut without " + "transpose graph"); + break; + } + + //case CEC: + // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, + // scaleFactor, vertexIDMapFileName, false); + + case GINGER_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + case GINGER_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Ginger without transpose graph"); + break; + } + + case FENNEL_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + case FENNEL_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); + break; + } + + case SUGAR_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + + default: + GALOIS_DIE("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +/** + * Loads a graph file with the purpose of iterating over the in edges + * of the graph. 
+ * + * @tparam NodeData node data to store in graph + * @tparam EdgeData edge data to store in graph + * @tparam iterateOut says if you want to iterate over out edges or not; if + * false, will iterate over in edges + * @tparam enable_if this function will only be enabled if iterateOut is false + * (i.e. iterate over in-edges) + * @param scaleFactor How to split nodes among hosts + * @returns a pointer to a newly allocated DistGraph based on the command line + * loaded based on command line arguments + */ +template ::type* = nullptr> +DistGraph* +constructGraph(std::vector& scaleFactor) { + auto& net = galois::runtime::getSystemNetworkInterface(); + + // 1 host = no concept of cut; just load from edgeCut + if (net.Num == 1) { + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + fprintf(stderr, "WARNING: Loading transpose graph through in-memory " + "transpose to iterate over in-edges: pass in transpose " + "graph with -graphTranspose to avoid unnecessary " + "overhead.\n"); + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + } + } + + switch (partitionScheme) { + case OEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose, + mastersFile + ); + case IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose, + mastersFile + ); + } else { + GALOIS_DIE("Error: attempting incoming edge cut without transpose " + "graph"); + break; + } + + case HOVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case HIVC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: (hivc) iterate over in-edges without transpose graph"); + break; + } + + case CART_VCUT: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case CART_VCUT_IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: (cvc) iterate over in-edges without transpose graph"); + break; + } + + //case CEC: + // if (inputFileTranspose.size()) { + // return new Graph_customEdgeCut(inputFileTranspose, "", net.ID, + // net.Num, scaleFactor, vertexIDMapFileName, + // false); + // } else { + // GALOIS_DIE("Error: (cec) iterate over in-edges without transpose graph"); + // break; + // } + + case GINGER_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case GINGER_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Ginger without transpose graph"); + break; + } + + case FENNEL_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case FENNEL_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); + break; + } + + case SUGAR_O: + return cuspPartitionGraph( + inputFile, 
galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + + default: + GALOIS_DIE("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +} // end namespace graphs +} // end namespace galois +#endif diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestargnn/src/DistributedGraphLoader.cpp new file mode 100644 index 0000000000..f2e336028e --- /dev/null +++ b/lonestargnn/src/DistributedGraphLoader.cpp @@ -0,0 +1,87 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting parallelism. + * The code is being released under the terms of the 3-Clause BSD License (a + * copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +/** + * @file DistributedGraphLoader.cpp + * + * Contains definitions for command line arguments related to distributed + * graph loading. + */ + +#include "DistributedGraphLoader.h" + +using namespace galois::graphs; + +namespace cll = llvm::cl; + +cll::opt inputFile(cll::Positional, cll::desc(""), + cll::Required); +cll::opt inputFileTranspose("graphTranspose", + cll::desc(""), + cll::init("")); +cll::opt + inputFileSymmetric("symmetricGraph", + cll::desc("Set this flag if graph is symmetric"), + cll::init(false)); + +cll::opt partitionScheme( + "partition", cll::desc("Type of partitioning."), + cll::values( + clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), + clEnumValN(IEC, "iec", "Incoming Edge-Cut"), + clEnumValN(HOVC, "hovc", "Outgoing Hybrid Vertex-Cut"), + clEnumValN(HIVC, "hivc", "Incoming Hybrid Vertex-Cut"), + clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), + clEnumValN(CART_VCUT_IEC, "cvc-iec", "Cartesian Vertex-Cut of iec"), + //clEnumValN(CEC, "cec", "Custom edge cut from vertexID mapping"), + clEnumValN(GINGER_O, "ginger-o", "ginger, outgiong edges, using CuSP"), + clEnumValN(GINGER_I, "ginger-i", "ginger, incoming edges, using CuSP"), + clEnumValN(FENNEL_O, "fennel-o", "fennel, outgoing edge cut, using CuSP"), + clEnumValN(FENNEL_I, "fennel-i", "fennel, incoming edge cut, using CuSP"), + clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP"), + clEnumValEnd), + cll::init(OEC)); + +//cll::opt +// vertexIDMapFileName("vertexIDMapFileName", +// cll::desc(""), +// cll::init(""), cll::Hidden); + +cll::opt readFromFile("readFromFile", + cll::desc("Set this flag if graph is to be " + "constructed from file (file must be " + "created by Abelian CSR)"), + cll::init(false), cll::Hidden); + +cll::opt + localGraphFileName("localGraphFileName", + cll::desc("Name of the local file to construct " + "local graph (file must be created by " + "Abelian CSR)"), + 
cll::init("local_graph"), cll::Hidden); + +cll::opt saveLocalGraph("saveLocalGraph", + cll::desc("Set to save the local CSR graph"), + cll::init(false), cll::Hidden); + +cll::opt mastersFile("mastersFile", + cll::desc("File specifying masters blocking"), + cll::init(""), cll::Hidden); From c58b33c2b2666faba9170f1f87e303764f1ebd19 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:42:03 -0500 Subject: [PATCH 139/660] constructSymGraph now integrated: partitioning of input possible --- lonestargnn/include/DistributedGraphLoader.h | 298 +------------------ lonestargnn/src/DistributedGraphLoader.cpp | 38 --- 2 files changed, 14 insertions(+), 322 deletions(-) diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h index 43d27d9669..b7da4faa54 100644 --- a/lonestargnn/include/DistributedGraphLoader.h +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -23,12 +23,15 @@ * Contains definitions for the common distributed graph loading functionality * of Galois. * + * Version for GNNs which only support symmetric graphs at this point in time. + * * @todo Refactoring a bunch of this code is likely very possible to do */ -#ifndef D_GRAPH_LOADER -#define D_GRAPH_LOADER +#ifndef D_GRAPH_LOADER_SYM +#define D_GRAPH_LOADER_SYM #include "galois/graphs/CuSPPartitioner.h" +#include "deepgalois/utils.h" /******************************************************************************* * Supported partitioning schemes @@ -44,7 +47,6 @@ enum PARTITIONING_SCHEME { HIVC, //!< incoming hybrid vertex cut CART_VCUT, //!< cartesian vertex cut CART_VCUT_IEC, //!< cartesian vertex cut using iec - //CEC, //!< custom edge cut GINGER_O, //!< Ginger, outgoing GINGER_I, //!< Ginger, incoming FENNEL_O, //!< Fennel, oec @@ -72,8 +74,6 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { return "cvc"; case CART_VCUT_IEC: return "cvc_iec"; - //case CEC: - // return "cec"; case GINGER_O: return "ginger-oec"; case GINGER_I: @@ -97,23 +97,9 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { namespace cll = llvm::cl; //! input graph file -extern cll::opt inputFile; -//! input graph file, but transposed -extern cll::opt inputFileTranspose; -//! symmetric input graph file -extern cll::opt inputFileSymmetric; +extern cll::opt dataset; //! partitioning scheme to use extern cll::opt partitionScheme; -////! path to vertex id map for custom edge cut -//extern cll::opt vertexIDMapFileName; -//! true if you want to read graph structure from a file -extern cll::opt readFromFile; -//! path to local graph structure to read -extern cll::opt localGraphFileName; -//! if true, the local graph structure will be saved to disk after partitioning -extern cll::opt saveLocalGraph; -//! 
file specifying blocking of masters -extern cll::opt mastersFile; // @todo command line argument for read balancing across hosts @@ -137,299 +123,43 @@ namespace graphs { template DistGraph* constructSymmetricGraph(std::vector& scaleFactor) { - if (!inputFileSymmetric) { - GALOIS_DIE("Calling constructSymmetricGraph without inputFileSymmetric " - "flag"); - } + std::string inputFile = deepgalois::path + dataset + ".csgr"; + galois::gInfo("File to read is ", inputFile); switch (partitionScheme) { case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose, - mastersFile + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); - //case CEC: - // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, - // scaleFactor, vertexIDMapFileName, false); - case GINGER_O: case GINGER_I: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case FENNEL_O: case FENNEL_I: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case SUGAR_O: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose - ); - default: - GALOIS_DIE("Error: partition scheme specified is invalid"); - return nullptr; - } -} - -/** - * Loads a graph file with the purpose of iterating over the out edges - * of the graph. 
- * - * @tparam NodeData node data to store in graph - * @tparam EdgeData edge data to store in graph - * @tparam iterateOut says if you want to iterate over out edges or not; if - * false, will iterate over in edgse - * @tparam enable_if this function will only be enabled if iterateOut is true - * @param scaleFactor How to split nodes among hosts - * @returns a pointer to a newly allocated DistGraph based on the command line - * loaded based on command line arguments - */ -template ::type* = nullptr> -DistGraph* -constructGraph(std::vector& scaleFactor) { - // 1 host = no concept of cut; just load from edgeCut, no transpose - auto& net = galois::runtime::getSystemNetworkInterface(); - if (net.Num == 1) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - } - - switch (partitionScheme) { - case OEC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose, - mastersFile + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); - case IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose, - mastersFile - ); - } else { - GALOIS_DIE("Error: attempting incoming edge cut without transpose " - "graph"); - break; - } - - case HOVC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - case HIVC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting incoming hybrid cut without transpose " - "graph"); - break; - } - - case CART_VCUT: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - - case CART_VCUT_IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting cvc incoming cut without " - "transpose graph"); - break; - } - - //case CEC: - // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, - // scaleFactor, vertexIDMapFileName, false); - - case GINGER_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - case GINGER_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Ginger without transpose graph"); - break; - } - - case FENNEL_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - case FENNEL_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); - break; - } - - case SUGAR_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - - default: - GALOIS_DIE("Error: partition scheme specified is invalid"); - return nullptr; - } -} - -/** - * Loads a graph file with the purpose of iterating over the in edges - * of the graph. 
- * - * @tparam NodeData node data to store in graph - * @tparam EdgeData edge data to store in graph - * @tparam iterateOut says if you want to iterate over out edges or not; if - * false, will iterate over in edges - * @tparam enable_if this function will only be enabled if iterateOut is false - * (i.e. iterate over in-edges) - * @param scaleFactor How to split nodes among hosts - * @returns a pointer to a newly allocated DistGraph based on the command line - * loaded based on command line arguments - */ -template ::type* = nullptr> -DistGraph* -constructGraph(std::vector& scaleFactor) { - auto& net = galois::runtime::getSystemNetworkInterface(); - - // 1 host = no concept of cut; just load from edgeCut - if (net.Num == 1) { - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - fprintf(stderr, "WARNING: Loading transpose graph through in-memory " - "transpose to iterate over in-edges: pass in transpose " - "graph with -graphTranspose to avoid unnecessary " - "overhead.\n"); - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - } - } - - switch (partitionScheme) { - case OEC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose, - mastersFile - ); - case IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose, - mastersFile - ); - } else { - GALOIS_DIE("Error: attempting incoming edge cut without transpose " - "graph"); - break; - } - - case HOVC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case HIVC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: (hivc) iterate over in-edges without transpose graph"); - break; - } - - case CART_VCUT: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case CART_VCUT_IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: (cvc) iterate over in-edges without transpose graph"); - break; - } - - //case CEC: - // if (inputFileTranspose.size()) { - // return new Graph_customEdgeCut(inputFileTranspose, "", net.ID, - // net.Num, scaleFactor, vertexIDMapFileName, - // false); - // } else { - // GALOIS_DIE("Error: (cec) iterate over in-edges without transpose graph"); - // break; - // } - - case GINGER_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case GINGER_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Ginger without transpose graph"); - break; - } - - case FENNEL_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case FENNEL_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); - break; - } - - case SUGAR_O: - return cuspPartitionGraph( - inputFile, 
galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestargnn/src/DistributedGraphLoader.cpp index f2e336028e..dbdf24ab90 100644 --- a/lonestargnn/src/DistributedGraphLoader.cpp +++ b/lonestargnn/src/DistributedGraphLoader.cpp @@ -30,16 +30,6 @@ using namespace galois::graphs; namespace cll = llvm::cl; -cll::opt inputFile(cll::Positional, cll::desc(""), - cll::Required); -cll::opt inputFileTranspose("graphTranspose", - cll::desc(""), - cll::init("")); -cll::opt - inputFileSymmetric("symmetricGraph", - cll::desc("Set this flag if graph is symmetric"), - cll::init(false)); - cll::opt partitionScheme( "partition", cll::desc("Type of partitioning."), cll::values( @@ -57,31 +47,3 @@ cll::opt partitionScheme( clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP"), clEnumValEnd), cll::init(OEC)); - -//cll::opt -// vertexIDMapFileName("vertexIDMapFileName", -// cll::desc(""), -// cll::init(""), cll::Hidden); - -cll::opt readFromFile("readFromFile", - cll::desc("Set this flag if graph is to be " - "constructed from file (file must be " - "created by Abelian CSR)"), - cll::init(false), cll::Hidden); - -cll::opt - localGraphFileName("localGraphFileName", - cll::desc("Name of the local file to construct " - "local graph (file must be created by " - "Abelian CSR)"), - cll::init("local_graph"), cll::Hidden); - -cll::opt saveLocalGraph("saveLocalGraph", - cll::desc("Set to save the local CSR graph"), - cll::init(false), cll::Hidden); - -cll::opt mastersFile("mastersFile", - cll::desc("File specifying masters blocking"), - cll::init(""), cll::Hidden); From 408090268a484583651b67dbffa084928511484c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 10 Mar 2020 07:45:12 -0500 Subject: [PATCH 140/660] remove print --- libgpu/include/graph_gpu.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 3f2c88a308..e2057bf7af 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -116,8 +116,8 @@ struct CSRGraph { printf("]\n"); } void add_selfloop() { - print_neighbors(nnodes-1); - print_neighbors(0); + //print_neighbors(nnodes-1); + //print_neighbors(0); index_type *new_edge_dst = new index_type[nnodes+nedges]; for (index_type i = 0; i < nnodes; i++) { index_type start = row_start[i]; @@ -147,8 +147,8 @@ struct CSRGraph { edge_dst = new_edge_dst; nedges += nnodes; printf("nnodes = %d, nedges = %d\n", nnodes, nedges); - print_neighbors(nnodes-1); - print_neighbors(0); + //print_neighbors(nnodes-1); + //print_neighbors(0); } __device__ __host__ index_type getEdgeDst(unsigned edge) { From 4ff0d384936e1368498e6ef8b290fa6a4eaaa36b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 10 Mar 2020 09:25:34 -0500 Subject: [PATCH 141/660] fix GPU compiling --- .../include/deepgalois/math_functions.hh | 5 ++-- libdeepgalois/src/context.cu | 23 +++++++------------ libdeepgalois/src/layers/aggregator.cu | 5 +++- libdeepgalois/src/math_functions.cu | 12 ++++++---- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 8f73ed609e..593ef03c5c 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -131,7 +131,7 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t 
dim_y, const size_t dim_z, void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nonzero_idx, - const float* B, const float beta, float* C); + const float* B, const float beta, float* trans_C, float* C); void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data); @@ -142,7 +142,8 @@ void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); -void float_malloc_device(int n, float_t*& loss); +void float_malloc_device(int n, float_t*& ptr); +void float_free_device(float_t*& ptr); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index d727904107..dfb0e3cc5e 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -61,9 +61,7 @@ cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() - : mode_(Context::GPU), solver_count_(1), solver_rank_(0), - multiprocess_(false) { +Context::Context() { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -105,24 +103,19 @@ void Context::norm_factor_counting() { CudaTest("solving norm_factor_counting kernel failed"); std::cout << "Done\n"; } - +/* void Context::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) - return; + if (current_device == device_id) return; CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) - CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } - +*/ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { std::string filename = path + dataset_str + ".csgr"; CSRGraph g; diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index ee5fe56b4d..1f739eef58 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -74,5 +74,8 @@ void deepgalois::update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, fl CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; //print_device_vector(10, norm_factor, "norm_factor"); - csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + float *temp; + float_malloc_device(n*len, temp); 
// TODO: avoid repetitive allocation + csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); + float_free_device(temp); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 531480091d..e723ba289f 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -53,8 +53,12 @@ bool is_allocated_device(float_t* data) { return false; } -void float_malloc_device(int n, float_t*& loss) { - CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); +void float_malloc_device(int n, float_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(float_t))); +} + +void float_free_device(float_t*& ptr) { + CUDA_CHECK(cudaFree(ptr)); } void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { @@ -186,9 +190,7 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, - const float* B, const float beta, float* C) { - float *transpose_C; - CUDA_CHECK(cudaMalloc((void**)&transpose_C, N * K * sizeof(float))); + const float* B, const float beta, float *transpose_C, float* C) { CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, From ba38d5158ced6bd458a6a1956e730e4132aa84bb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 14:19:19 -0500 Subject: [PATCH 142/660] graph_cpu is now a pointer Done for compatibility with Dist template graphs later --- libdeepgalois/include/deepgalois/context.h | 2 +- libdeepgalois/src/context.cpp | 17 ++++++++++------- libdeepgalois/src/layers/graph_conv_layer.cpp | 6 ++++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index be0154e33f..b765515e50 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -43,7 +43,7 @@ class Context { float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N + Graph* graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); #else diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 2717567d28..cf481aa040 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -15,6 +15,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } +#ifndef GALOIS_USE_DIST size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -23,7 +24,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo printf("Reading .el file: %s\n", filename.c_str()); LGraph lgraph; lgraph.read_edgelist(filename.c_str(), true); // symmetrize - genGraph(lgraph, graph_cpu); + genGraph(lgraph, *graph_cpu); lgraph.clean(); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; @@ -31,16 +32,17 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo if (selfloop) { Graph graph_temp; galois::graphs::readGraph(graph_temp, filename); - 
add_selfloop(graph_temp, graph_cpu); - } else galois::graphs::readGraph(graph_cpu, filename); + add_selfloop(graph_temp, *graph_cpu); + } else galois::graphs::readGraph(*graph_cpu, filename); +// TODO dist version of self loop } else { printf("Unkown file format\n"); exit(1); } Tread.stop(); - std::cout << "num_vertices " << graph_cpu.size() << " num_edges " - << graph_cpu.sizeEdges() << "\n"; - return graph_cpu.size(); + std::cout << "num_vertices " << graph_cpu->size() << " num_edges " + << graph_cpu->sizeEdges() << "\n"; + return graph_cpu->size(); } void Context::genGraph(LGraph& lg, Graph& g) { @@ -55,12 +57,13 @@ void Context::genGraph(LGraph& lg, Graph& g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } +#endif void Context::norm_factor_counting() { norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&](auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 1c631a9d21..86ab1abd2f 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -70,7 +70,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ } else deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); // aggregate based on graph topology - graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); + graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); // TODO sync required here // run relu activation on output if specified @@ -87,7 +87,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call - deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z // TODO sync required here // at this point, out_temp has the derivative of data from last step to @@ -107,8 +107,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // multiplied by gradients from last back prop step deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z +#ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); +#endif } #endif } // namespace From 85665b9fa429bb71e50143a203ed9f14cdbc7a0f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 15:56:21 -0500 Subject: [PATCH 143/660] gcn app update to load distgraph --- lonestargnn/gcn/gcn.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index f2e3f3f1eb..3642959b95 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,16 +18,18 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train + +#ifdef GALOIS_USE_DIST + std::vector dummy; + Graph* testing = galois::graphs::constructSymmetricGraph(dummy); +#endif + // read network, features, ground truth, 
initialize metadata network.init(dataset, epochs, hidden1, add_selfloop); network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); -#ifdef GALOIS_USE_DIST - std::vector dummy; - galois::graphs::constructSymmetricGraph(dummy); -#endif // tracks peak memory usage deepgalois::ResourceManager rm; From f14825ae6178deb4d0192016f60fcc152acb7bc2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 15:57:23 -0500 Subject: [PATCH 144/660] dist context placeholders --- .../include/deepgalois/DistContext.h | 39 +++++++++++++++++++ libdeepgalois/src/DistContext.cpp | 29 ++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/DistContext.h create mode 100644 libdeepgalois/src/DistContext.cpp diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h new file mode 100644 index 0000000000..f22d7c221a --- /dev/null +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -0,0 +1,39 @@ +#ifndef __DG_DIST_CONTEXT__ +#define __DG_DIST_CONTEXT__ +/** + * Based on common.hpp file of the Caffe deep learning library. + */ +#include "deepgalois/types.h" +#include "deepgalois/utils.h" +#include "deepgalois/gtypes.h" + +namespace deepgalois { + +class DistContext { + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + std::vector labels; // labels for classification: N x 1 + vec_t h_feats; // input features: N x D + +public: + DistContext(); + ~DistContext(); + + size_t saveGraph(Graph* dGraph); + size_t read_labels(std::string dataset_str); + size_t read_features(std::string dataset_str); + void norm_factor_counting(); + + // TODO why are these public + float_t* norm_factor; // normalization constant based on graph structure + Graph* graph_cpu; // the input graph, |V| = N + + label_t get_label(size_t i) { return labels[i]; } + size_t read_graph_cpu(std::string dataset_str); + float_t* get_in_ptr(); +}; + +} // end deepgalois namespace + +#endif diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp new file mode 100644 index 0000000000..22ecdbb1c6 --- /dev/null +++ b/libdeepgalois/src/DistContext.cpp @@ -0,0 +1,29 @@ +#include "deepgalois/DistContext.h" + +namespace deepgalois { +DistContext::DistContext() {} +DistContext::~DistContext() {} + +size_t DistContext::saveGraph(Graph* dGraph) { + // TODO + return 0; +} +size_t DistContext::read_labels(std::string dataset_str) { + // TODO + return 0; +} +size_t DistContext::read_features(std::string dataset_str) { + // TODO + return 0; +} + +float_t* DistContext::get_in_ptr() { + // TODO + return nullptr; +} + +void DistContext::norm_factor_counting() { + // TODO +} + +} // deepgalois From ae361bef40770467062a2eeb535d8f090fc6708a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 16:02:55 -0500 Subject: [PATCH 145/660] if dist_galois, use DistContext; all placeholder at the moment TODO implement new functions/new flow --- libdeepgalois/CMakeLists.txt | 16 +++++++++++++ libdeepgalois/include/deepgalois/gtypes.h | 7 ++++++ .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 14 +++++++++-- libdeepgalois/include/deepgalois/net.h | 12 +++++++++- libdeepgalois/src/context.cpp | 24 +++++++++---------- libdeepgalois/src/layers/aggregator.cpp | 8 ++++++- libdeepgalois/src/net.cpp | 9 ++++++- 8 files changed, 73 insertions(+), 19 deletions(-) diff --git 
a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index bdc0f97942..b625c317e3 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -54,6 +54,20 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if(ENABLE_DIST_GALOIS) +# do not link regular context.cpp; TODO do this conditional in cleaner way +set(sources + src/layers/graph_conv_layer.cpp + src/layers/softmax_loss_layer.cpp + src/layers/aggregator.cpp + src/layers/layer.cpp + src/math_functions.cpp + src/optimizer.cpp + src/DistContext.cpp + src/node.cpp + src/net.cpp +) +else() set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp @@ -65,6 +79,8 @@ set(sources src/node.cpp src/net.cpp ) +endif() + add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem gllvm) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 5278660692..c30c72f730 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,12 +1,16 @@ #pragma once #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" +#ifdef GALOIS_USE_DIST +#include "galois/graphs/NewGeneric.h" +#endif // TODO namespace typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; +#ifndef GALOIS_USE_DIST #ifdef EDGE_LABEL typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< true>::type ::with_no_lockable::type Graph; @@ -14,5 +18,8 @@ typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< true>::type ::with_no_lockable::type Graph; #endif +#else +using Graph = galois::graphs::DistGraph; +#endif typedef Graph::GraphNode GNode; diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 4904b13905..66749a8572 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -33,7 +33,7 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - void set_context(deepgalois::Context* ctx) { context = ctx; norm_factor = ctx->norm_factor; } + void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->norm_factor; } //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a91f495915..116ab43aa1 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -11,7 +11,11 @@ #include "deepgalois/types.h" #include "deepgalois/utils.h" +#ifndef GALOIS_USE_DIST #include "deepgalois/context.h" +#else +#include "deepgalois/DistContext.h" +#endif #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" #include "deepgalois/layers/node.h" @@ -38,6 +42,12 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: +#ifndef GALOIS_USE_DIST + using ContextType = deepgalois::Context; +#else + using ContextType = deepgalois::DistContext; +#endif + layer(unsigned level, std::vector in_dims, std::vector out_dims) : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), @@ -49,7 +59,7 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void set_netphase(deepgalois::net_phase phase) {} //! save context - virtual void set_context(deepgalois::Context* ctx) { context = ctx; } + virtual void set_context(ContextType* ctx) { context = ctx; } //! return layer loss virtual acc_t get_masked_loss() { return acc_t(0); } @@ -149,7 +159,7 @@ class layer : public deepgalois::node { mask_t* masks_; // masks to show which samples are valid mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 - deepgalois::Context* context; + ContextType* context; #ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 0daf730c42..4f481d1d0a 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -8,10 +8,16 @@ #include "galois/Timer.h" #include "deepgalois/types.h" #include "deepgalois/gtypes.h" -#include "deepgalois/context.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/optimizer.h" +#ifndef GALOIS_USE_DIST +#include "deepgalois/context.h" +#else +#include "deepgalois/DistContext.h" +#endif + + #define NUM_CONV_LAYERS 2 @@ -113,7 +119,11 @@ class Net { } protected: +#ifndef GALOIS_USE_DIST deepgalois::Context* context; +#else + deepgalois::DistContext* context; +#endif size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index cf481aa040..9206b1cc1a 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -15,7 +15,6 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -#ifndef GALOIS_USE_DIST size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -57,18 +56,6 @@ void Context::genGraph(LGraph& lg, Graph& g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } -#endif - -void Context::norm_factor_counting() { - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else 
norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -} void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); @@ -103,6 +90,17 @@ void Context::add_selfloop(Graph &og, Graph &g) { } float_t* Context::get_in_ptr() { return &h_feats[0]; } + +void Context::norm_factor_counting() { + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} #endif // labels contain the ground truth (e.g. vertex classes) for each example diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 6d7c7f6cbe..4e86f148e8 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -5,7 +5,13 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { // zero out the output data - galois::do_all(galois::iterate(g), [&](const GNode src) { + #ifndef GALOIS_USE_DIST + galois::do_all(g, + #else + auto& rangeObj = g.allNodesRange(); + galois::do_all(galois::iterate(rangeObj), + #endif + [&](const GNode src) { deepgalois::math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index d594e789e8..53baa60c13 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -7,9 +7,16 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { + #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - // read graph, get num nodes num_samples = context->read_graph(dataset_str, selfloop); + #else + context = new deepgalois::DistContext(); + // TODO self loop? 
+ // TODO num samples + #endif + + // read graph, get num nodes num_classes = context->read_labels(dataset_str); num_epochs = epochs; From 7e9c10d7caf1244b85815dcb290054acf91e235a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 16:58:29 -0500 Subject: [PATCH 146/660] Dist net: read masks, test sets, saving dGraph setup TODO labels and features in dist setting --- .../include/deepgalois/DistContext.h | 7 ++- libdeepgalois/include/deepgalois/gtypes.h | 7 ++- libdeepgalois/include/deepgalois/net.h | 15 ++++-- libdeepgalois/include/deepgalois/utils.h | 46 ++++++++++++++++++- libdeepgalois/src/DistContext.cpp | 42 +++++++++++++++-- libdeepgalois/src/net.cpp | 44 ++++++++++++++---- lonestargnn/gcn/gcn.cpp | 26 +++++++++-- 7 files changed, 162 insertions(+), 25 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index f22d7c221a..c7317dd7d2 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -20,7 +20,7 @@ class DistContext { DistContext(); ~DistContext(); - size_t saveGraph(Graph* dGraph); + void saveGraph(Graph* dGraph); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str); void norm_factor_counting(); @@ -29,7 +29,10 @@ class DistContext { float_t* norm_factor; // normalization constant based on graph structure Graph* graph_cpu; // the input graph, |V| = N - label_t get_label(size_t i) { return labels[i]; } + label_t get_label(size_t i) { + // TODO global id only or lid only or both? + return labels[i]; + } size_t read_graph_cpu(std::string dataset_str); float_t* get_in_ptr(); }; diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index c30c72f730..5dc08fc99e 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,6 +1,9 @@ -#pragma once +#ifndef __DG_GTYPES__ +#define __DG_GTYPES__ + #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" +#include "deepgalois/types.h" #ifdef GALOIS_USE_DIST #include "galois/graphs/NewGeneric.h" #endif @@ -23,3 +26,5 @@ using Graph = galois::graphs::DistGraph; #endif typedef Graph::GraphNode GNode; + +#endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 4f481d1d0a..f905d2a595 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -30,7 +30,12 @@ namespace deepgalois { class Net { public: Net() {} + #ifndef GALOIS_USE_DIST void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop); + #else + void init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, Graph* dGraph); + #endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -96,20 +101,24 @@ class Net { // back propogation void bprop() { - for (size_t i = num_layers; i != 0; i--) + for (size_t i = num_layers; i != 0; i--) { layers[i - 1]->backward(); + } } // update trainable weights after back-propagation void update_weights(optimizer* opt) { - for (size_t i = 0; i < num_layers; i++) - if (layers[i]->trainable()) + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { layers[i]->update_weight(opt); + } + } } // evaluate, i.e. 
inference or predict double evaluate(size_t begin, size_t end, size_t count, mask_t* masks, acc_t& loss, acc_t& acc) { + // TODO may need to do something for the dist case Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 51c0bb5c95..ad33285879 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -7,6 +7,9 @@ #include #include #include +#ifdef GALOIS_USE_DIST +#include "deepgalois/gtypes.h" +#endif namespace deepgalois { @@ -100,7 +103,9 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) <= p; } -//! Get masks from datafile where first line tells range of + +#ifndef GALOIS_USE_DIST +//! Get masks from datafile where first line tells range of //! set to create mask from inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, @@ -134,5 +139,44 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, in.close(); return sample_count; } +#else +//! Get masks from datafile where first line tells range of +//! set to create mask from; needs graph object due to local IDs +inline size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, + std::vector& masks, Graph* dGraph) { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; +} +#endif } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 22ecdbb1c6..a6b85965fe 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -4,14 +4,46 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -size_t DistContext::saveGraph(Graph* dGraph) { - // TODO - return 0; +void DistContext::saveGraph(Graph* dGraph) { + graph_cpu = dGraph; } size_t DistContext::read_labels(std::string dataset_str) { - // TODO - return 0; + Graph* dGraph = DistContext::graph_cpu; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading labels...\n"); + + //std::string filename = path + dataset_str + "-labels.txt"; + //std::ifstream in; + //std::string line; + //in.open(filename, std::ios::in); + //size_t m; + //// read file header + //an >> m >> num_classes >> std::ws; + //assert(m == dGraph->globalSize()); + //// size of labels is only # local nodes + //labels.resize(dGraph.size(), 0); + + //unsigned v = 0; + //while (std::getline(in, line)) { + // std::istringstream label_stream(line); + // unsigned x; + // for (size_t idx = 0; idx < num_classes; ++idx) { + // label_stream >> x; + // if (x != 0) { + // labels[v] = idx; + // break; + // } + // } + // 
v++; + //} + //in.close(); + + //// print the number of vertex classes + //std::cout << "Done, unique label counts: " << num_classes + // << ", time: " << t_read.Millisecs() << " ms\n"; + //return num_classes; } + size_t DistContext::read_features(std::string dataset_str) { // TODO return 0; diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 53baa60c13..704951f59e 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -6,15 +6,22 @@ namespace deepgalois { +#ifndef GALOIS_USE_DIST void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { - #ifndef GALOIS_USE_DIST +#else +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, Graph* dGraph) { +#endif +#ifndef GALOIS_USE_DIST context = new deepgalois::Context(); num_samples = context->read_graph(dataset_str, selfloop); - #else +#else context = new deepgalois::DistContext(); + num_samples = dGraph->size(); + context->saveGraph(dGraph); // TODO self loop? // TODO num samples - #endif +#endif // read graph, get num nodes num_classes = context->read_labels(dataset_str); @@ -28,14 +35,35 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool train_begin = 0, train_count = 153431, train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) - train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) - val_mask[i] = 1; + // TODO do all can be used below +#ifndef GALOIS_USE_DIST + for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; +#else + // find local ID from global ID, set if it exists + for (size_t i = train_begin; i < train_end; i++) { + if (dGraph->isLocal(i)) { + train_mask[dGraph->getLID(i)] = 1; + } + } + for (size_t i = val_begin; i < val_end; i++) { + if (dGraph->isLocal(i)) { + val_mask[dGraph->getLID(i)] = 1; + } + } +#endif } else { +#ifndef GALOIS_USE_DIST train_count = read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); +#else + train_count = + read_masks(dataset_str, "train", train_begin, train_end, train_mask, + dGraph); + val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask, + dGraph); +#endif } //std::cout << "Done\n"; @@ -132,7 +160,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { - int preds = argmax(num_classes, + int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 3642959b95..d688258cd3 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -15,22 +15,24 @@ int main(int argc, char** argv) { #else galois::DistMemSys G; #endif - LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train #ifdef GALOIS_USE_DIST - std::vector dummy; - Graph* testing = galois::graphs::constructSymmetricGraph(dummy); + std::vector dummyVec; + Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif +#ifndef GALOIS_USE_DIST // read network, features, ground truth, initialize metadata network.init(dataset, epochs, 
hidden1, add_selfloop); +#else + network.init(dataset, epochs, hidden1, add_selfloop, dGraph); +#endif network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); - // tracks peak memory usage deepgalois::ResourceManager rm; @@ -54,10 +56,24 @@ int main(int argc, char** argv) { test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; +#ifndef GALOIS_USE_DIST for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; - } else +#else + for (size_t i = test_begin; i < test_end; i++) { + if (dGraph->isLocal(i)) { + test_mask[dGraph->getLID(i)] = 1; + } + } +#endif + } else { +#ifndef GALOIS_USE_DIST test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, test_mask); +#else + test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, + test_mask, dGraph); +#endif + } galois::StatTimer Ttest("Test"); Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, From eada9951fb134c82b0769495236be23e55d340c5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:01:39 -0500 Subject: [PATCH 147/660] do_all aggregator fix for shared mem deepgalois --- libdeepgalois/src/layers/aggregator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 4e86f148e8..40a8fdcf8f 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,7 +6,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou bool norm, const float_t* norm_factor) { // zero out the output data #ifndef GALOIS_USE_DIST - galois::do_all(g, + galois::do_all(galois::iterate(g), #else auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), From 32f1c47e147ba9955a9a7196e493718024748c42 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:18:34 -0500 Subject: [PATCH 148/660] cmake changes for DEEPGALOIS + HETERO DistContext currently does not support gpus, so have to separate it out for now --- CMakeLists.txt | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1b0489c10..4be9753f54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,7 +182,36 @@ add_definitions(-DGALOIS_COPYRIGHT_YEAR=${GALOIS_COPYRIGHT_YEAR}) # Distributed-heterogeneous features if(ENABLE_HETERO_GALOIS) - set(ENABLE_DIST_GALOIS ON) + if (NOT USE_DEEPGALOIS) + # do not turn on DIST_GALOIS by default if DEEP_GALOIS is enabled + # with HETERO galois + set(ENABLE_DIST_GALOIS ON) + endif() + if (USE_DEEPGALOIS) + SET(CUDA_SEPARABLE_COMPILATION ON) + find_package(CUDA REQUIRED) + set(CUDA_PROPAGATE_HOST_FLAGS off) + set(CUDA_SEPARABLE_COMPILATION on) + set(CUDA_HOST_COMPILER g++) + + string(REPLACE "." 
"" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) + endforeach() + + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + + # MGPU v1.1 + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + cuda_include_directories("${MGPU_ROOT}/src") + + # CUB v1.6.4 + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + cuda_include_directories("${CUB_ROOT}") + + #find_package(OpenCL REQUIRED) + endif() endif() if(ENABLE_DIST_GALOIS) add_definitions(-DGALOIS_USE_DIST) @@ -509,10 +538,12 @@ if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) - if(ENABLE_HETERO_GALOIS) - add_subdirectory(libgpu) - endif(ENABLE_HETERO_GALOIS) endif(ENABLE_DIST_GALOIS) + +if(ENABLE_HETERO_GALOIS) + add_subdirectory(libgpu) +endif(ENABLE_HETERO_GALOIS) + add_subdirectory(tools) add_subdirectory(scripts) From 110793810dd57ab217b6921597fd8e7dab199c60 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:28:00 -0500 Subject: [PATCH 149/660] naive distirubted label reading for deepgalois complete --- libdeepgalois/src/DistContext.cpp | 76 ++++++++++++++++++------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index a6b85965fe..4847079376 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -10,38 +10,50 @@ void DistContext::saveGraph(Graph* dGraph) { size_t DistContext::read_labels(std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - galois::gPrint("[", myID, "] Reading labels...\n"); - - //std::string filename = path + dataset_str + "-labels.txt"; - //std::ifstream in; - //std::string line; - //in.open(filename, std::ios::in); - //size_t m; - //// read file header - //an >> m >> num_classes >> std::ws; - //assert(m == dGraph->globalSize()); - //// size of labels is only # local nodes - //labels.resize(dGraph.size(), 0); - - //unsigned v = 0; - //while (std::getline(in, line)) { - // std::istringstream label_stream(line); - // unsigned x; - // for (size_t idx = 0; idx < num_classes; ++idx) { - // label_stream >> x; - // if (x != 0) { - // labels[v] = idx; - // break; - // } - // } - // v++; - //} - //in.close(); - - //// print the number of vertex classes - //std::cout << "Done, unique label counts: " << num_classes - // << ", time: " << t_read.Millisecs() << " ms\n"; - //return num_classes; + galois::gPrint("[", myID, "] Reading labels from disk...\n"); + + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; + // read file header + in >> m >> num_classes >> std::ws; + assert(m == dGraph->globalSize()); + // size of labels should be # local nodes + labels.resize(dGraph->size(), 0); + + uint32_t foundVertices = 0; + unsigned v = 0; + // each line contains a set of 0s and 1s + while (std::getline(in, line)) { + // only bother if local node + if (dGraph->isLocal(v)) { + std::istringstream label_stream(line); + unsigned x; + // for each class + for (size_t idx = 0; idx < num_classes; ++idx) { + // check if that class is labeled + label_stream >> x; + if (x != 0) { + // set local id + labels[dGraph->getLID(v)] = idx; + foundVertices++; + break; + } + } + 
} + // always increment v + v++; + } + + in.close(); + + // print the number of vertex classes + galois::gPrint("[", myID, "] Done with labels, unique label counts: ", + num_classes, "; set ", foundVertices, " nodes\n"); + + return num_classes; } size_t DistContext::read_features(std::string dataset_str) { From c891c338f41aba5a9275037db9d1b5337f4997a4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:47:09 -0500 Subject: [PATCH 150/660] feature reading for distributed case --- .../include/deepgalois/DistContext.h | 3 +- libdeepgalois/src/DistContext.cpp | 41 ++++++++++++++++++- libdeepgalois/src/net.cpp | 1 - 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index c7317dd7d2..5449b337fb 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -26,13 +26,14 @@ class DistContext { void norm_factor_counting(); // TODO why are these public - float_t* norm_factor; // normalization constant based on graph structure + float_t* norm_factor; // normalization constant based on graph structure Graph* graph_cpu; // the input graph, |V| = N label_t get_label(size_t i) { // TODO global id only or lid only or both? return labels[i]; } + size_t read_graph_cpu(std::string dataset_str); float_t* get_in_ptr(); }; diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 4847079376..3859263f1d 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -57,8 +57,45 @@ size_t DistContext::read_labels(std::string dataset_str) { } size_t DistContext::read_features(std::string dataset_str) { - // TODO - return 0; + Graph* dGraph = DistContext::graph_cpu; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading features from disk...\n"); + + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + + in.open(filename, std::ios::in); + size_t m; // m = number of global vertices + + // header read + in >> m >> feat_len >> std::ws; + // use local size, not global size + h_feats.resize(dGraph->size() * feat_len, 0); + + // loop through all features + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + // vertex to set feature for + edge_stream >> u; + // only set if local + if (dGraph->isLocal(u)) { + // feature index + edge_stream >> v; + // actual feature + edge_stream >> w; + + h_feats[dGraph->getLID(u) * feat_len + v] = w; + } + } + in.close(); + + galois::gPrint("[", myID, "] Done with features, feature length: ", + feat_len, "\n"); + + return feat_len; } float_t* DistContext::get_in_ptr() { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 704951f59e..a101ddb4ff 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -20,7 +20,6 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, num_samples = dGraph->size(); context->saveGraph(dGraph); // TODO self loop? 
- // TODO num samples #endif // read graph, get num nodes From eb4c80d5db5fecb522b935e439c39bd89f465631 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 18:01:33 -0500 Subject: [PATCH 151/660] the rest of missing distcontext functions TODO norm factor needs dist execution --- .../include/deepgalois/DistContext.h | 18 +++++++++++------ libdeepgalois/src/DistContext.cpp | 20 ++++++++++++++++--- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 5449b337fb..15b91babda 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,31 +10,37 @@ namespace deepgalois { class DistContext { - size_t n; // number of samples: N + size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D std::vector labels; // labels for classification: N x 1 vec_t h_feats; // input features: N x D public: + // TODO why are these public + float_t* norm_factor; // normalization constant based on graph structure + Graph* graph_cpu; // the input graph, |V| = N + DistContext(); ~DistContext(); + //! save graph pointer to context object void saveGraph(Graph* dGraph); + //! read labels of local nodes only size_t read_labels(std::string dataset_str); + //! read features of local nodes only size_t read_features(std::string dataset_str); + //! find norm factor by looking at degree + // TODO this is a distributed operation void norm_factor_counting(); - // TODO why are these public - float_t* norm_factor; // normalization constant based on graph structure - Graph* graph_cpu; // the input graph, |V| = N - + //! return label for some node label_t get_label(size_t i) { // TODO global id only or lid only or both? return labels[i]; } - size_t read_graph_cpu(std::string dataset_str); + //! 
returns pointer to the features of each local node float_t* get_in_ptr(); }; diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 3859263f1d..90214b19d0 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,7 +6,10 @@ DistContext::~DistContext() {} void DistContext::saveGraph(Graph* dGraph) { graph_cpu = dGraph; + + localVertices = graph_cpu->size(); } + size_t DistContext::read_labels(std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -99,12 +102,23 @@ size_t DistContext::read_features(std::string dataset_str) { } float_t* DistContext::get_in_ptr() { - // TODO - return nullptr; + return &h_feats[0]; } void DistContext::norm_factor_counting() { - // TODO + // TODO: this is a distributed operation + + // create for now, TODO need to actually fill it in + norm_factor = new float_t[localVertices]; + //galois::do_all(galois::iterate((size_t)0, localVertices), + // [&](auto v) { + // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); + // float_t temp = std::sqrt(float_t(degree)); + // if (temp == 0.0) norm_factor[v] = 0.0; + // else norm_factor[v] = 1.0 / temp; + // }, galois::loopname("NormCounting")); + + return; } } // deepgalois From 56b945d443617559a7118232c624652d56e0292b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 18:49:02 -0500 Subject: [PATCH 152/660] temp dist context fix for norm_factor --- libdeepgalois/src/DistContext.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 90214b19d0..768b1dbab9 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -110,6 +110,11 @@ void DistContext::norm_factor_counting() { // create for now, TODO need to actually fill it in norm_factor = new float_t[localVertices]; + galois::do_all(galois::iterate((size_t)0, localVertices), + [&](auto v) { + norm_factor[v] = 0.01; + }, galois::loopname("NormCounting")); + //galois::do_all(galois::iterate((size_t)0, localVertices), // [&](auto v) { // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); From aa11e2d083988b9974f998c57ae681a0845706b7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 10 Mar 2020 18:54:24 -0500 Subject: [PATCH 153/660] gpu errors --- libdeepgalois/include/deepgalois/math_functions.hh | 1 + libdeepgalois/src/context.cu | 7 +++++++ libdeepgalois/src/math_functions.cu | 4 ++++ libdeepgalois/src/net.cu | 1 + 4 files changed, 13 insertions(+) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 593ef03c5c..27866be13c 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -144,6 +144,7 @@ bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); +void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index dfb0e3cc5e..4d77433eda 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -136,5 +136,12 @@ void 
Context::copy_data_to_device() { //print_device_vector(10, d_feats, "d_feats"); } +//void Context::copy_data_to_device() { + //float_malloc_device(n, d_labels); + //float_copy_device(n, &labels[0], d_labels); + //float_malloc_device(n*feat_len, d_feats); + //float_copy_device(n*feat_len, &h_feats[0], d_feats); +//} + float_t* Context::get_in_ptr() { return d_feats; } } // namespace context diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e723ba289f..7cb5253e13 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -61,6 +61,10 @@ void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } +void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} + void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 947967d07c..62dec7cad4 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -1,4 +1,5 @@ #include "deepgalois/net.h" +#include "deepgalois/cutils.h" #include "gg.h" #include "ggcuda.h" From ef6372ed8ae6f642998f1eab719c7aeb7f1d182e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 11 Mar 2020 10:41:12 -0500 Subject: [PATCH 154/660] d_softmax_cross_entropy_kernel --- libdeepgalois/src/math_functions.cu | 55 ++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 7cb5253e13..6ebc222412 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -322,8 +322,29 @@ void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, CudaTest("solving softmax_cross_entropy kernel failed"); } +__device__ void d_cross_entropy_device(int n, const label_t idx, const float_t* p, float_t* d) { + for (int i = 0; i < n; i++) { + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; + } +} + +__global__ void d_cross_entropy_kernel(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + int base = begin * len; + CUDA_KERNEL_LOOP(i, (end-begin)*len) { + int id = begin + i/len; + if (masks[id] == 1) { // masked + if (i%len == (int)labels[id]) grad[i] = -1.0 / (data[i+base] + 1e-10); + else grad[i] = 0.0; + //d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); + } + } +} + // TODO: use warp -__device__ void d_softmax(int n, const float_t* p, const float_t* dp, float_t* dy) { +__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { for (int i = 0; i < n; i++) { dy[i] = 0; for (int j = 0; j < n; j++) { @@ -333,14 +354,16 @@ __device__ void d_softmax(int n, const float_t* p, const float_t* dp, float_t* d } } -__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { - for (int i = 0; i < n; i++) { - //assert(p[i] >= 0.0); - //assert(p[i] >= 0.0 && p[i] <= 1.0); - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; +__global__ void d_softmax_kernel(int len, int begin, int end, + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + d_softmax_device(len, data + len*id, in_grad + len*i, 
out_grad + len*id); + } } -} +} __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, @@ -349,8 +372,8 @@ __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, int id = begin + i; if (masks[id] == 1) { // masked float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[id], out + len*id, out_grad); - d_softmax(len, out + len*id, out_grad, diff + len*id); + d_cross_entropy_device(len, labels[id], out + len*id, out_grad); + d_softmax_device(len, out + len*id, out_grad, diff + len*id); } } } @@ -358,8 +381,16 @@ __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - d_softmax_cross_entropy_kernel<<>>( - len, begin, end, masks, labels, out, diff); +// d_softmax_cross_entropy_kernel<<>>( +// len, begin, end, masks, labels, out, diff); +// CudaTest("solving d_softmax_cross_entropy kernel failed"); + float_t *grad; + float_malloc_device((end-begin)*len, grad); + d_cross_entropy_kernel<<>>( + len, begin, end, masks, labels, out, grad); + CudaTest("solving d_cross_entropy kernel failed"); + d_softmax_kernel<<>>( + len, begin, end, masks, out, grad, diff); CudaTest("solving d_softmax_cross_entropy kernel failed"); } From a447d05253a722d6ee5ca0f1c3712337693e8be5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 11 Mar 2020 14:34:52 -0500 Subject: [PATCH 155/660] gpu softmax updated --- libdeepgalois/include/deepgalois/types.h | 1 + libdeepgalois/src/math_functions.cu | 137 +++++++++++++++++++++-- 2 files changed, 130 insertions(+), 8 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 1a32d5a47d..611ba57828 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -25,6 +25,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: #define TB_SIZE 256 #define BLOCK_SIZE 256 #define WARP_SIZE 32 +#define MAX_NUM_CLASSES 64 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 6ebc222412..e15a503eca 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -343,6 +343,36 @@ __global__ void d_cross_entropy_kernel(int len, int begin, int end, } } +__global__ void d_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) p[warp_lane][pid] = data[base+pid]; + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + 
grad[wid*len+pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else grad[wid*len+pid] = 0.0; + } + } + } + } +} // TODO: use warp __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { for (int i = 0; i < n; i++) { @@ -365,6 +395,46 @@ __global__ void d_softmax_kernel(int len, int begin, int end, } } +__global__ void d_softmax_warp(int len, int begin, int end, + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + p[warp_lane][pid] = data[base+pid]; + d[warp_lane][pid] = in_grad[wid*len+pid]; + } + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = (j == pid) ? self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + out_grad[base+pid] = sum; + } + } + __syncthreads(); + } + } +} + __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { @@ -378,19 +448,70 @@ __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, } } +__global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) p[warp_lane][pid] = data[base+pid]; + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + d[warp_lane][pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else d[warp_lane][pid] = 0.0; + } + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + grad[base+pid] = sum; + } + } + __syncthreads(); + } + } +} + void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { // d_softmax_cross_entropy_kernel<<>>( // len, begin, end, masks, labels, out, diff); // CudaTest("solving d_softmax_cross_entropy kernel failed"); - float_t *grad; - float_malloc_device((end-begin)*len, grad); - d_cross_entropy_kernel<<>>( - len, begin, end, masks, labels, out, grad); - CudaTest("solving d_cross_entropy kernel failed"); - d_softmax_kernel<<>>( - len, begin, end, masks, out, grad, diff); - CudaTest("solving d_softmax_cross_entropy kernel failed"); + //float_t *grad; + //float_malloc_device((end-begin)*len, grad); + //d_cross_entropy_kernel<<>>( + //d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, labels, out, grad); + //CudaTest("solving d_cross_entropy kernel failed"); + //d_softmax_kernel<<>>( + //d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, out, grad, diff); + //CudaTest("solving d_softmax kernel failed"); + d_softmax_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + len, begin, end, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); } From b36c722acc90441ca87e6aeec3879f48e8e2d1d5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 13 Mar 2020 08:53:15 -0500 Subject: [PATCH 156/660] match DGL gpu performance --- .../deepgalois/layers/graph_conv_layer.h | 2 + .../include/deepgalois/math_functions.hh | 4 +- libdeepgalois/src/layers/graph_conv_layer.cu | 64 +++++++++++++++---- libdeepgalois/src/math_functions.cu | 61 ++++++++++++------ 4 files changed, 94 insertions(+), 37 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 66749a8572..c77467eeca 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -49,6 +49,7 @@ class graph_conv_layer : public layer { #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); + void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); private: bool act_; // whether to use activation function at the end @@ -63,6 +64,7 @@ class graph_conv_layer : public layer { size_t z; float_t* out_temp; //!< intermediate data temporary float_t* in_temp; + float_t* in_temp1; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y float_t* norm_factor; // normalization constant based on graph structure diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 27866be13c..ffc0343438 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -140,12 +140,10 @@ void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void 
float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned*& masks, float_t*& in, float_t*& out, - float_t*& matrix, float_t*& grad); #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index b2a9209bd4..15796c95d3 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -3,7 +3,26 @@ namespace deepgalois { void graph_conv_layer::init() { - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); + if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(unsigned))); + //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); + float_malloc_device(x*y, in_temp); + init_const_gpu(x*y, 0.0, in_temp); + if (y <= z) { + float_malloc_device(x*y, in_temp1); + init_const_gpu(x*y, 0.0, in_temp1); + } + //CUDA_CHECK(cudaMalloc((void**)&out_temp, x * z * sizeof(float_t))); + float_malloc_device(x*z, out_temp); + init_const_gpu(x*z, 0.0, out_temp); + //CUDA_CHECK(cudaMalloc((void**)&d_W, y * z * sizeof(float_t))); + float_malloc_device(y*z, d_W); + auto init_range = sqrt(6.0 / (y + z)); + // Glorot & Bengio (AISTATS 2010) + rng_uniform_gpu(y * z, -init_range, init_range, d_W); + //CUDA_CHECK(cudaMalloc((void**)&layer::d_weight_grad, y * z * sizeof(float_t))); + float_malloc_device(y*z, layer::d_weight_grad); + //CUDA_CHECK(cudaMemset(layer::d_weight_grad, 0, y * z * sizeof(float_t))); + init_const_gpu(y*z, 0.0, layer::d_weight_grad); } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { @@ -14,18 +33,31 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo #endif } +void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { +#ifdef USE_CUSPARSE + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); +#else + deepgalois::update_all(len, g, in, out, norm_, norm_factor); +#endif +} + void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { } // GPU forward: compute output features +// NOTE: in_data will be used in back-prop, so it can not be modified void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - //assert(y <= 128); // currently only support feature length <= 128 + assert(z <= MAX_NUM_CLASSES); // currently only support feature length <= 128 init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == deepgalois::net_phase::train) { + if (dropout_ && phase_ == deepgalois::net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + if (y > z) { sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + } else { + graph_conv_layer::aggregate(y, context->graph_gpu, in_temp, in_temp1); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, out_data); + } if (act_) relu_gpu(x * z, out_data, out_data); } @@ -34,16 +66,20 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* 
out_grad, float_t* in_grad) { if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); -#ifdef USE_CUSPARSE - update_all_csrmm(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); -#else - update_all(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); -#endif - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); - if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); + if (y > z) { + graph_conv_layer::d_aggregate(z, context->graph_gpu, out_grad, out_temp); + if (level_ != 0) + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); + } else { + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, in_temp); + graph_conv_layer::d_aggregate(y, context->graph_gpu, in_temp, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, layer::d_weight_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); + if (level_ != 0 && dropout_) + d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } } // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e15a503eca..8002d728a5 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -32,7 +32,7 @@ void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(deepgalois::Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { +void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) @@ -71,22 +71,6 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned*& masks, float_t*& in, float_t*& out, - float_t*& matrix, float_t*& grad) { - if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); - init_const_gpu(x*y, 0.0, in); - CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); - init_const_gpu(x*z, 0.0, out); - CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0 / (y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y * z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); -} - __global__ void setup_curand_kernel(const int n, curandState* state) { CUDA_KERNEL_LOOP(i, n) { // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 @@ -185,16 +169,21 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { - // std::cout << "[debug]: matmul1D1D_gpu\n"; const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } +// C = A x B, where A is a sparse matrix in CSR format, B is 
the dense matrix for vertex +// feature tensor. However, since cusparse only supports column-major, while feature +// tensor is stored in row-major, the actual computation is: C = trans(A x trans(B)). +// Currently, we use cublasSgeam to implement transposition and allocate intermediate +// workspace memory (transpose_C) for this. void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, @@ -203,9 +192,41 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float one = 1.0; const float zero = 0.0; CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, - N, M, &one, transpose_C, M, &zero, transpose_C, M, C, N)); + N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } - +/* +void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float *transpose_C, float* C) { + std::cout << "[debug]: csrmm_gpu\n"; + cusparseSpMatDescr_t A_descr; + CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, A_nonzeros, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + cusparseDnMatDescr_t B_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, CUSPARSE_ORDER_COL)); + cusparseDnMatDescr_t C_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, CUSPARSE_ORDER_COL)); + size_t bufferSize; + CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::Context::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + (void*)&alpha, A_descr, B_descr, (void*)&beta, C_descr, + CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); + cudaDeviceSynchronize(); + void* buffer = NULL; + if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); + CUSPARSE_CHECK(cusparseSpMM(deepgalois::Context::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + (const void*)&alpha, A_descr, B_descr, (const void*)&beta, C_descr, + CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); + cudaDeviceSynchronize(); + //transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, + N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +} +//*/ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { From 6f2955d076e06d7479d5e694ecf2ddc62d0ce9cf Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 13 Mar 2020 10:17:50 -0500 Subject: [PATCH 157/660] fix bug --- libdeepgalois/src/layers/graph_conv_layer.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 15796c95d3..12d9902179 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -51,6 +51,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ 
init_const_gpu(x*z, 0.0, out_temp); if (dropout_ && phase_ == deepgalois::net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + else copy_gpu(x*y, in_data, in_temp); if (y > z) { sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); From 17184398f80797ca8d25646beb2dd6a8eba7a09a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 15 Mar 2020 16:05:56 -0500 Subject: [PATCH 158/660] add gin --- libdeepgalois/include/deepgalois/types.h | 2 ++ lonestargnn/gin/CMakeLists.txt | 9 ++++++ lonestargnn/gin/gin.cpp | 35 ++++++++++++++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 lonestargnn/gin/CMakeLists.txt create mode 100644 lonestargnn/gin/gin.cpp diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 611ba57828..c1658c045f 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -21,6 +21,8 @@ typedef float acc_t; // Accuracy type typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test +typedef uint32_t VertexID; + #define CHUNK_SIZE 256 #define TB_SIZE 256 #define BLOCK_SIZE 256 diff --git a/lonestargnn/gin/CMakeLists.txt b/lonestargnn/gin/CMakeLists.txt new file mode 100644 index 0000000000..f32f47179e --- /dev/null +++ b/lonestargnn/gin/CMakeLists.txt @@ -0,0 +1,9 @@ +app(gin gin.cpp) +target_link_libraries(gin dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(gin distgraphloader) +endif() +if(ENABLE_HETERO_GALOIS) + target_link_libraries(gin dg_gpu) + target_link_libraries(gin -lcudart -lcublas -lcurand -lcudadevrt) +endif() diff --git a/lonestargnn/gin/gin.cpp b/lonestargnn/gin/gin.cpp new file mode 100644 index 0000000000..aecfcf9b35 --- /dev/null +++ b/lonestargnn/gin/gin.cpp @@ -0,0 +1,35 @@ +// Graph Neural Networks +// Xuhao Chen +#include "lonestargnn.h" +#ifdef GALOIS_USE_DIST +#include "DistributedGraphLoader.h" +#endif + +const char* name = "Graph Isomorphism Network (GIN)"; +const char* desc = "Graph isomorphism neural networks on an undirected graph"; +const char* url = 0; +static cll::optlearn_eps("le", cll::desc("whether to learn the parameter epsilon (default value false)"), cll::init(0)); +static cll::optagg_type("at", cll::desc("Aggregator Type"), cll::init("sum")); + +template <> +class graph_conv_layer { +public: + FV apply_edge(VertexID src, VertexID dst, FV2D in_data) { + return in_data[dst]; + } + FV apply_vertex(VertexID src, FV2D in_data) { + FV a = deepgalois::matmul(deepgalois::accum, deepgalois::W); + FV b = deepgalois::scale(in_data[src], 1.0 + self.eps); + return deepgalois::vadd(a, b); + } +}; + +int main(int argc, char** argv) { + galois::SharedMemSys G; + LonestarGnnStart(argc, argv, name, desc, url); + deepgalois::Net network; // the neural network to train + + graph_conv_layer layer0; + return 0; +} + From 04bac8e4d0634e8ef3a8a85b66ff95ffb25f5322 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 15:36:15 -0500 Subject: [PATCH 159/660] initialization fo sync substrate --- .../include/deepgalois/DistContext.h | 7 ++++++- libdeepgalois/src/DistContext.cpp | 14 +++++++++++++ libdeepgalois/src/net.cpp | 21 +++++++++---------- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h 
b/libdeepgalois/include/deepgalois/DistContext.h index 15b91babda..4baaaae8a9 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -3,6 +3,7 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ +#include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" @@ -15,6 +16,7 @@ class DistContext { size_t feat_len; // input feature length: D std::vector labels; // labels for classification: N x 1 vec_t h_feats; // input features: N x D + galois::graphs::GluonSubstrate* syncSubstrate; public: // TODO why are these public @@ -34,9 +36,12 @@ class DistContext { // TODO this is a distributed operation void norm_factor_counting(); + void initializeSyncSubstrate(); + galois::graphs::GluonSubstrate* getSyncSubstrate(); + //! return label for some node + //! NOTE: this is LID, not GID label_t get_label(size_t i) { - // TODO global id only or lid only or both? return labels[i]; } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 768b1dbab9..c206c1d654 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -126,4 +126,18 @@ void DistContext::norm_factor_counting() { return; } +void DistContext::initializeSyncSubstrate() { + DistContext::syncSubstrate = + new galois::graphs::GluonSubstrate( + *DistContext::graph_cpu, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, + false + ); +} + +galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { + return DistContext::syncSubstrate; +}; + } // deepgalois diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a101ddb4ff..c14a8397c6 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -19,7 +19,8 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, context = new deepgalois::DistContext(); num_samples = dGraph->size(); context->saveGraph(dGraph); - // TODO self loop? + // TODO self loop setup? 
+ context->initializeSyncSubstrate(); #endif // read graph, get num nodes @@ -90,8 +91,7 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned i = 0; i < num_epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed - << std::setprecision(3) << ":"; + galois::gPrint("Epoch ", std::setw(2), i, std::fixed, std::setprecision(3), ":"); t_epoch.Start(); // training steps @@ -121,8 +121,8 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss - << " train_acc = " << std::setw(5) << train_acc; + galois::gPrint("train_loss = ", std::setw(5), train_loss, " train_acc = ", + std::setw(5), train_acc); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); if (need_validate) { @@ -132,13 +132,12 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss - << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time - << " ms (train_time = " << epoch_time - << " val_time = " << val_time << ")\n"; + galois::gPrint(" val_loss = ", std::setw(5), val_loss, " val_acc = ", + std::setw(5), val_acc); + galois::gPrint(" time = ", epoch_time + val_time, " ms (train_time = ", + epoch_time, " val_time = ", val_time, ")\n"); } else { - std::cout << " train_time = " << epoch_time << " ms\n"; + galois::gPrint(" train_time = ", epoch_time, " ms\n"); } } } From 3d591c3e6b9a359ee236916767dc03a84bf1ff7a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 17:24:13 -0500 Subject: [PATCH 160/660] fix compile issue of conv layer on cpu --- libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index c77467eeca..2267b1a55c 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -49,7 +49,9 @@ class graph_conv_layer : public layer { #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); +#ifndef CPU_ONLY void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); +#endif private: bool act_; // whether to use activation function at the end From c5f5843b5cb96e39d321d39b9ea2bc09fd5cb1f2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 17:34:35 -0500 Subject: [PATCH 161/660] Sync structs for graph conv layer defined --- .../layers/GraphConvSyncStructures.h | 63 +++++++++++++++++++ .../deepgalois/layers/graph_conv_layer.h | 3 + libdeepgalois/include/deepgalois/types.h | 16 ++++- 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h new file mode 100644 index 0000000000..3b95d55f82 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -0,0 +1,63 @@ +#ifndef __GRAPH_CONV_SYNC_STRUCT__ +#define __GRAPH_CONV_SYNC_STRUCT__ + +struct GraphConvSync { + using ValTy = std::vector; + + //! 
return a vector of floats to sync + static ValTy extract(uint32_t node_id, char& filler) { + // TODO figure out how to avoid copy from C array to vector; best + // way is if original data is in a vector probably, but that has the + // issue of not being able to directly call BLAS + ValTy vecToReturn; + // allocate space + vecToReturn.resize(deepgalois::_syncVectorSize); + // copy the node's data to vector to serialize/send + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + vecToReturn[i] = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; + } + // move constructor should kick in here to avoid return copy + return vecToReturn; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char& filler, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += y[i]; + } + return true; + } + + //! do nothing (waste of a write) + static void reset(uint32_t node_id, char& filler) { + } + + //! element wise set + static void setVal(uint32_t node_id, char& filler, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, + DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +#endif diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 2267b1a55c..0bf7a7e698 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,6 +1,9 @@ #pragma once #include "layer.h" #include "deepgalois/layers/aggregator.h" +#ifdef GALOIS_USE_DIST +#include "deepgalois/layers/GraphConvSyncStructures.h" +#endif /** * GraphConv Layer; based on DGL implementation + follows TinyDNN layer diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index c1658c045f..3c3c7ce747 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -1,5 +1,5 @@ -#ifndef TYPES_H -#define TYPES_H +#ifndef _GNN_TYPES_H_ +#define _GNN_TYPES_H_ #include #include @@ -30,4 +30,16 @@ typedef uint32_t VertexID; #define MAX_NUM_CLASSES 64 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE + + +#ifdef GALOIS_USE_DIST +namespace deepgalois { + //! Set this to let sync struct know where to get data from + static float_t* _dataToSync = nullptr; + //! Set this to let sync struct know the size of the vector to use during + //! 
sync + static long unsigned _syncVectorSize = 0; +} +#endif + #endif From 6c500c55704285f8d778fa19f15a94e89752bcac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:12:26 -0500 Subject: [PATCH 162/660] norm factor is temporarily 1 for dist execution TODO needs to be based on degree? --- libdeepgalois/src/DistContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index c206c1d654..9069fad351 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -112,7 +112,7 @@ void DistContext::norm_factor_counting() { norm_factor = new float_t[localVertices]; galois::do_all(galois::iterate((size_t)0, localVertices), [&](auto v) { - norm_factor[v] = 0.01; + norm_factor[v] = 1; }, galois::loopname("NormCounting")); //galois::do_all(galois::iterate((size_t)0, localVertices), From 2c94782bd7b4393c4a22c36f9adefcc32f93dff8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:21:27 -0500 Subject: [PATCH 163/660] less messy prints for dist execution purposes (new line) TODO merge evertyhign into single print --- libdeepgalois/src/net.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c14a8397c6..f8d21dee99 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -122,7 +122,7 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); galois::gPrint("train_loss = ", std::setw(5), train_loss, " train_acc = ", - std::setw(5), train_acc); + std::setw(5), train_acc, "\n"); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); if (need_validate) { @@ -133,7 +133,7 @@ void Net::train(optimizer* opt, bool need_validate) { val_loss, val_acc); Tval.stop(); galois::gPrint(" val_loss = ", std::setw(5), val_loss, " val_acc = ", - std::setw(5), val_acc); + std::setw(5), val_acc, "\n"); galois::gPrint(" time = ", epoch_time + val_time, " ms (train_time = ", epoch_time, " val_time = ", val_time, ")\n"); } else { From b29c1a54da2a58dcb52c4a01b88247744bba7034 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:25:42 -0500 Subject: [PATCH 164/660] sync calls added to graph_conv_layer TODO weight gradient combination --- libdeepgalois/src/layers/graph_conv_layer.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 86ab1abd2f..f3dbd62e94 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,7 +71,11 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // aggregate based on graph topology graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); - // TODO sync required here + // TODO sync of out_data required here + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_data; + layer::context->getSyncSubstrate()->sync("AggSync"); // run relu activation on output if specified if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); @@ -88,7 +92,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z - // TODO sync required here + // sync agg 
+ deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_temp; + layer::context->getSyncSubstrate()->sync("AggSyncBack"); // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights From dbb0204b9b94c08a1069b0c0523f3f0a45504028 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:36:48 -0500 Subject: [PATCH 165/660] for now gradient sync is a trivial summation TODO change it to something different --- .../include/deepgalois/layers/GradientSyncStructs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index df88352bcf..d0074d11ed 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -13,10 +13,10 @@ struct GradientSync { static bool reduce(uint32_t node_id, float_t& weight, ValTy y) { // TODO merge function here // for now make sure the weights are close enough - if (std::abs(weight - y) > 0.00001) { - galois::gInfo("weight ", node_id, " not consistent with one received"); - } - + //if (std::abs(weight - y) > 0.00001) { + // galois::gInfo("weight ", node_id, " not consistent with one received"); + //} + weight += y; return true; } From 75f2dad1c51dab464f188be2f37701672b2f43a2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:44:41 -0500 Subject: [PATCH 166/660] USE_DST wrapping around sync calls --- libdeepgalois/src/layers/graph_conv_layer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f3dbd62e94..171b32305c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,12 +71,13 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // aggregate based on graph topology graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); +#ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync("AggSync"); - +#endif // run relu activation on output if specified if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); } @@ -92,11 +93,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z +#ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; layer::context->getSyncSubstrate()->sync("AggSyncBack"); +#endif // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights From 61af2fcb94aeb71ce31ea3a9dacac29a9fe274df Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 11:44:12 -0500 Subject: [PATCH 167/660] net: print modifications that take into account dist execution --- libdeepgalois/src/net.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index f8d21dee99..7677417c99 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -82,6 
+82,12 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, } void Net::train(optimizer* opt, bool need_validate) { +#ifdef GALOIS_USE_DIST + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; +#else + unsigned myID = 0; +#endif + galois::gPrint("\nStart training...\n"); galois::StatTimer Tupdate("Train-WeightUpdate"); galois::StatTimer Tfw("Train-Forward"); @@ -91,7 +97,8 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned i = 0; i < num_epochs; i++) { - galois::gPrint("Epoch ", std::setw(2), i, std::fixed, std::setprecision(3), ":"); + galois::gPrint("[", myID, "] Epoch ", std::setw(2), i, std::fixed, + std::setprecision(3), "\n"); t_epoch.Start(); // training steps @@ -121,7 +128,7 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); - galois::gPrint("train_loss = ", std::setw(5), train_loss, " train_acc = ", + galois::gPrint("[", myID, "] train_loss = ", std::setw(5), train_loss, " train_acc = ", std::setw(5), train_acc, "\n"); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); @@ -132,12 +139,12 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); - galois::gPrint(" val_loss = ", std::setw(5), val_loss, " val_acc = ", + galois::gPrint("[", myID, "] val_loss = ", std::setw(5), val_loss, " val_acc = ", std::setw(5), val_acc, "\n"); - galois::gPrint(" time = ", epoch_time + val_time, " ms (train_time = ", + galois::gPrint("[", myID, "] time = ", epoch_time + val_time, " ms (train_time = ", epoch_time, " val_time = ", val_time, ")\n"); } else { - galois::gPrint(" train_time = ", epoch_time, " ms\n"); + galois::gPrint("[", myID, "] train_time = ", epoch_time, " ms\n"); } } } From f7b48605f2537a0cb5b8d1ae9065929b140b3bb7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 12:53:58 -0500 Subject: [PATCH 168/660] shared memory context return graph pointer --- libdeepgalois/include/deepgalois/context.h | 2 ++ libdeepgalois/src/context.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index b765515e50..b5822a5555 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -46,6 +46,8 @@ class Context { Graph* graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); + //! 
returns pointer to the graph + Graph* getGraphPointer(); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9206b1cc1a..404b8fef7f 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -89,6 +89,10 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } +Graph* Context::getGraphPointer() { + return Context::graph_cpu; +} + float_t* Context::get_in_ptr() { return &h_feats[0]; } void Context::norm_factor_counting() { From 2bbbcde18139727ef32b892c49e473b815c39517 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 12:54:24 -0500 Subject: [PATCH 169/660] dist context, return graph pointer --- libdeepgalois/include/deepgalois/DistContext.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 4baaaae8a9..704247d54b 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -39,6 +39,10 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); + Graph* getGraphPointer() { + return graph_cpu; + } + //! return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { From 03412cd81f3ec20fe8346ed3b7624d705bd9c18c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 13:03:01 -0500 Subject: [PATCH 170/660] semblance of unified accuracy (masked_accuracy made distributed) --- libdeepgalois/include/deepgalois/gtypes.h | 3 ++ libdeepgalois/include/deepgalois/net.h | 5 ++- libdeepgalois/src/net.cpp | 53 +++++++++++++++++++++-- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 5dc08fc99e..dfc2e1d8c6 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -12,6 +12,9 @@ typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; +#ifdef GALOIS_USE_DIST +using AccuracyAccum = galois::DGAccumulator; +#endif #ifndef GALOIS_USE_DIST #ifdef EDGE_LABEL diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index f905d2a595..74cf3f6058 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -122,7 +122,7 @@ class Net { Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); t_eval.Stop(); return t_eval.Millisecs(); } @@ -142,7 +142,8 @@ class Net { size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, + Graph* dGraph); }; } // namespace deepgalois diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7677417c99..8991f779c5 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -67,6 +67,9 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, } //std::cout << "Done\n"; + // NOTE: train_begin/train_end are global IDs, train_mask is a local id + // 
train count and val count are LOCAL counts + num_layers = NUM_CONV_LAYERS + 1; // initialize feature metadata feature_dims.resize(num_layers + 1); @@ -111,7 +114,7 @@ void Net::train(optimizer* opt, bool need_validate) { train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0]); // predict + &train_mask[0], context->getGraphPointer()); // predict Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -160,18 +163,60 @@ void Net::construct_layers() { } #ifdef CPU_ONLY -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { +/** + * + * @param begin GLOBAL begin + * @param end GLOBAL end + * @param count GLOBAL training count + */ +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, + Graph* dGraph) { +#ifndef GALOIS_USE_DIST AccumF accuracy_all; +#else + AccuracyAccum accuracy_all; + galois::DGAccumulator sampleCount; + sampleCount.reset(); +#endif + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { +#ifndef GALOIS_USE_DIST if (masks[i] == 1) { + // get prediction int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + // check prediction if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; } +#else + // only look at owned nodes (i.e. masters); the prediction for these + // should only by handled on the owner + if (dGraph->isOwned(i)) { + sampleCount += 1; + + uint32_t localID = dGraph->getLID(i); + if (masks[localID] == 1) { + // get prediction + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[localID * num_classes])); + // check prediction + if ((label_t)preds == context->get_label(localID)) + accuracy_all += 1.0; + } + } +#endif }, - galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + galois::loopname("getMaskedLoss")); + +#ifdef GALOIS_USE_DIST + count = sampleCount.reduce(); + galois::gDebug("sample count is ", count); +#endif + + // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)count; } #endif From 4315ebc546f82633ccb65c27436b80078580fbdb Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 16 Apr 2020 19:39:46 -0500 Subject: [PATCH 171/660] add sampler --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/sampler.h | 7 +++++++ libdeepgalois/src/sampler.cpp | 4 ++++ 3 files changed, 12 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/sampler.h create mode 100644 libdeepgalois/src/sampler.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index b625c317e3..e8ff6e420a 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -76,6 +76,7 @@ set(sources src/math_functions.cpp src/optimizer.cpp src/context.cpp + src/sampler.cpp src/node.cpp src/net.cpp ) diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h new file mode 100644 index 0000000000..079a84d415 --- /dev/null +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -0,0 +1,7 @@ +#pragma once +#include "deepgalois/gtypes.h" + +void subgraph_sampler(Graph &g, Graph &sg); +galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); +Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } +Graph::edge_iterator 
sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); }
diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp
new file mode 100644
index 0000000000..5077d5b756
--- /dev/null
+++ b/libdeepgalois/src/sampler.cpp
@@ -0,0 +1,4 @@
+#include "deepgalois/sampler.h"
+
+void subgraph_sampler(Graph &g, Graph &sg) {
+}
From b2c17a8498e69c4647812a988f1c64ba9d64246e Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Thu, 16 Apr 2020 21:55:47 -0500
Subject: [PATCH 172/660] update sampler

---
 libdeepgalois/src/sampler.cpp | 111 +++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp
index 5077d5b756..98dac8c75c 100644
--- a/libdeepgalois/src/sampler.cpp
+++ b/libdeepgalois/src/sampler.cpp
@@ -1,4 +1,113 @@
 #include "deepgalois/sampler.h"
+#include <ctime>
+#include <cstdlib>
 
-void subgraph_sampler(Graph &g, Graph &sg) {
+// select k vertices from the range [begin, end) by reservoir sampling
+static std::vector<GNode> selectVertex(GNode begin, GNode end, size_t k) {
+  auto i = begin;
+
+  // reservoir[] is the output array. Initialize
+  // it with the first k vertices
+  std::vector<GNode> reservoir(k);
+  for (; i < k; i++) reservoir[i] = i;
+
+  // Use a different seed value so that we don't get
+  // the same result each time we run this program
+  srand(time(NULL));
+
+  // Iterate from the (k+1)th element to the nth element
+  for (; i < end; i++) {
+    // Pick a random index from 0 to i.
+    auto j = rand() % (i + 1);
+
+    // If the randomly picked index is smaller than k,
+    // then replace the element present at that index
+    // with the new element from the stream
+    if (j < k) reservoir[j] = i;
+  }
+  return reservoir;
+}
+
+// Utility function to find the ceiling of r in arr[l..h]
+int findCeil(const std::vector<unsigned>& arr, unsigned r, unsigned l, unsigned h) {
+  unsigned mid;
+  while (l < h) {
+    mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2
+    (r > arr[mid]) ? (l = mid + 1) : (h = mid);
+  }
+  return (arr[l] >= r) ? l : -1;
+}
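+
+// Worked example (hypothetical numbers, for illustration only): for the frequencies
+// dist = {2, 1, 3}, the prefix sums computed below are offsets = {2, 3, 6}. A random
+// r drawn from [1, 6] then maps to index 0 when r <= 2, to index 1 when r == 3, and
+// to index 2 when r >= 4, i.e. each index is chosen with probability dist[i] / 6;
+// findCeil locates that index by binary-searching for the smallest offset that is >= r.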
+
+// select one element from n elements given a frequency (probability) distribution
+// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/
+size_t selectOneVertex(size_t n, const std::vector<unsigned>& dist) {
+  std::vector<unsigned> offsets(n);
+  offsets[0] = dist[0];
+  // compute the prefix sum of the distribution
+  for (size_t i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i];
+  // offsets[n-1] is the sum of all frequencies
+  unsigned sum = offsets[n-1];
+  unsigned r = (rand() % sum) + 1;
+  // find which range r falls into,
+  // and return the index of the range
+  return findCeil(offsets, r, 0, n - 1);
+}
+
+inline unsigned getDegree(Graph &g, GNode v) {
+  return std::distance(g.edge_begin(v), g.edge_end(v));
+}
+
+void generate_subgraph(std::set<GNode> &vertex_set, Graph &g, Graph &sub) {
+  auto nv = vertex_set.size();
+  size_t ne = 0;
+  std::vector<unsigned> offsets(nv+1);
+  offsets[0] = 0;
+  size_t i = 0;
+  std::vector<GNode> vertices(nv);
+  for (auto v : vertex_set) {
+    vertices[i] = v;
+    offsets[i+1] = offsets[i] + getDegree(g, v);
+    i++;
+  }
+  ne = offsets[nv]; // total degree of the selected vertices (upper bound on subgraph edges)
+  // TODO: remove edges whose endpoints do not belong to the selected vertex subset
+  sub.allocateFrom(nv, ne);
+  sub.constructNodes();
+  for (i = 0; i < nv; i++) {
+    sub.fixEndEdge(i, offsets[i+1]);
+    for (unsigned offset = 0; offset < offsets[i+1]-offsets[i]; offset ++) {
+      sub.constructEdge(offsets[i]+offset, g.getEdgeDst(g.edge_begin(vertices[i])+offset), 0);
+    }
+  }
+}
+
+// generate a subgraph sg with size n from the input graph g
+// n: number of vertices in the subgraph
+// m: number of vertices in the frontier
+void subgraph_sampler(Graph &g, Graph &sg, size_t n, size_t m) {
+  auto num_vertices = g.size(); // number of vertices in the original input graph
+  auto frontier = selectVertex(0, num_vertices, m); // randomly select m vertices from g as frontier
+  std::set<GNode> vertex_set;
+  for (size_t i = 0; i < m; i++)
+    vertex_set.insert(frontier[i]);
+  std::vector<unsigned> degrees(m);
+  //std::vector<float> probabilities(m);
+  //unsigned sum_degree = 0;
+  for (size_t i = 0; i < m; i++) {
+    degrees[i] = getDegree(g, frontier[i]);
+    //sum_degree += degrees[i];
+  }
+  for (size_t i = 0; i < n - m; i++) {
+    //for (size_t i = 0; i < m; i++)
+    //  probabilities[i] = (float)degrees[i] / (float)sum_degree;
+    auto pos = selectOneVertex(m, degrees);
+    GNode u = frontier[pos];
+    auto degree = degrees[pos];
+    auto neighbor_id = rand() % degree;
+    frontier[pos] = g.getEdgeDst(g.edge_begin(u) + neighbor_id);
+    degrees[pos] = getDegree(g, frontier[pos]);
+    //sum_degree -= degree;
+    //sum_degree += degrees[pos];
+    vertex_set.insert(u);
+  }
+  generate_subgraph(vertex_set, g, sg);
 }
From e292fbb556511747fb7c8526a9273d2489ad2add Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Fri, 17 Apr 2020 20:51:16 -0500
Subject: [PATCH 173/660] add sigmoid

---
 libdeepgalois/CMakeLists.txt                  |  1 +
 .../deepgalois/layers/sigmoid_loss_layer.h    | 18 +++++
 .../include/deepgalois/math_functions.hh      | 11 +++-
 .../src/layers/sigmoid_loss_layer.cpp         | 65 +++++++++++++++++++
 libdeepgalois/src/math_functions.cpp          | 25 ++++---
 5 files changed, 109 insertions(+), 11 deletions(-)
 create mode 100644 libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h
 create mode 100644 libdeepgalois/src/layers/sigmoid_loss_layer.cpp

diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt
index e8ff6e420a..f92d8950a9 100644
--- a/libdeepgalois/CMakeLists.txt
+++ b/libdeepgalois/CMakeLists.txt
@@ -71,6 +71,7 @@ else()
   set(sources
     src/layers/graph_conv_layer.cpp
src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp src/layers/aggregator.cpp src/layers/layer.cpp src/math_functions.cpp diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h new file mode 100644 index 0000000000..31bab85daa --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -0,0 +1,18 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class sigmoid_loss_layer : public layer { +public: + sigmoid_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~sigmoid_loss_layer() {} + std::string layer_type() const override { + return std::string("sigmoid_loss"); + } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + virtual acc_t get_masked_loss(); +}; +} diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index ffc0343438..7aa388ab13 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -39,15 +39,22 @@ void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, void relu_cpu(size_t n, const float_t* in, float_t* out); //! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); + +// Loss function for single-class label (one-hot) data: softmax void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, - const float_t* dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); + float_t cross_entropy(const vec_t& y, const vec_t& p); float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); + +// Loss function for multi-class label (one-hot) data: sigmoid +void sigmoid(size_t n, const float_t* input, float_t* output); +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); + //! 
copy vector from in -> out; first len elements void copy_cpu(size_t len, const float_t* in, float_t* out); // single-precision dense matrix multiply diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp new file mode 100644 index 0000000000..220d3da102 --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -0,0 +1,65 @@ +#include "deepgalois/layers/sigmoid_loss_layer.h" + +namespace deepgalois { + +#ifdef CPU_ONLY +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + loss = new float_t[in_dims[0]]; // error for each sample +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t len = input_dims[1]; + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + // output is normalized input for this layer + math::sigmoid(len, &in_data[len*i], &out_data[len*i]); // normalize using sigmoid + // one hot encoded vector for the labels + std::vector groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[context->get_label(i)] = 1.0; // one-hot TODO: modify for multi-class label + // loss calculation + loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t len = layer::input_dims[1]; + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector groundTruth(len, 0.0); + groundTruth[context->get_label(i)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + // derviative sigmoid to gradient used in the next layer + math::d_sigmoid(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); +} + +acc_t sigmoid_loss_layer::get_masked_loss() { + assert(count_ > 0); + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} +#endif + +} // namespace diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 6b383e4b78..58e3543652 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -501,17 +501,24 @@ float reduce_mean(const vec_t& x) { return sum / (float)n; } - - -float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } +// use sigmoid instead of softmax for multi-class datasets, e.g. 
ppi, yelp and amazon +// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } +inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } // Sigmoid -void sigmoid(vec_t& fv) { - size_t count = fv.size(); - for (size_t i = 0; i < count; ++i) { - fv[i] = sigmoid_func(fv[i]); - } +void sigmoid(const vec_t& input, vec_t &output) { + for (size_t i = 0; i < input.size(); ++i) + output[i] = sigmoid_func(input[i]); } +void sigmoid(size_t n, const float_t* input, float_t* output) { + for (int i=0; i< n; i++) { + output[i] = 1. / (1. + expf(-input[i])); + } +} - +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + for (int i=0; i< n; i++) { + dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); + } +} From fdd84d3c5a22a7875a37f540aa74d5ec2b559659 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 18 Apr 2020 17:51:35 -0500 Subject: [PATCH 174/660] fix bug --- libdeepgalois/include/deepgalois/context.h | 2 + .../include/deepgalois/math_functions.hh | 2 + libdeepgalois/include/deepgalois/net.h | 10 ++++- libdeepgalois/src/context.cpp | 1 + libdeepgalois/src/math_functions.cpp | 44 ++++++++++--------- libdeepgalois/src/net.cpp | 28 +++++++----- lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/include/lonestargnn.h | 4 +- 8 files changed, 57 insertions(+), 36 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index b5822a5555..754b7a8491 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -32,10 +32,12 @@ class Context { size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); + //void set_label_class(bool is_single = true) { is_single_class = is_single; } size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D + //bool is_single_class; // single-class (one-hot) or multi-class label std::vector labels; // labels for classification: N x 1 label_t* d_labels; // labels on device vec_t h_feats; // input features: N x D diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 7aa388ab13..46f571ac35 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -52,7 +52,9 @@ void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // Loss function for multi-class label (one-hot) data: sigmoid +void sigmoid(const vec_t& input, vec_t& output); void sigmoid(size_t n, const float_t* input, float_t* output); +void d_sigmoid(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); //! 
copy vector from in -> out; first len elements diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 74cf3f6058..e29e1863ff 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -10,6 +10,7 @@ #include "deepgalois/gtypes.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" @@ -31,7 +32,8 @@ class Net { public: Net() {} #ifndef GALOIS_USE_DIST - void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop); + void init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, bool is_single = true); #else void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop, Graph* dGraph); @@ -79,7 +81,10 @@ class Net { in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); connect(layers[layer_id - 1], layers[layer_id]); } @@ -133,6 +138,7 @@ class Net { #else deepgalois::DistContext* context; #endif + bool is_single_class; // single-class (one-hot) or multi-class label size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 404b8fef7f..52db06ca62 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -18,6 +18,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); + graph_cpu = new Graph(); if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 58e3543652..cdde9cc964 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -261,6 +261,29 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { d[i] = -y[i] / (p[i] + float_t(1e-10)); } } + +// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and amazon +// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } +inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } + +// Sigmoid +void sigmoid(const vec_t& in, vec_t &out) { + for (size_t i = 0; i < in.size(); ++i) + out[i] = sigmoid_func(in[i]); +} + +void sigmoid(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; i++) { + out[i] = 1. / (1. 
+ expf(-in[i])); + } +} + +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + for (size_t i = 0; i < n; i++) { + dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); + } +} + void copy1D1D(const vec_t& in, vec_t& out) { std::copy(in.begin(), in.end(), &out[0]); } @@ -501,24 +524,3 @@ float reduce_mean(const vec_t& x) { return sum / (float)n; } -// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and amazon -// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } -inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } - -// Sigmoid -void sigmoid(const vec_t& input, vec_t &output) { - for (size_t i = 0; i < input.size(); ++i) - output[i] = sigmoid_func(input[i]); -} - -void sigmoid(size_t n, const float_t* input, float_t* output) { - for (int i=0; i< n; i++) { - output[i] = 1. / (1. + expf(-input[i])); - } -} - -void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { - for (int i=0; i< n; i++) { - dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); - } -} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 8991f779c5..19a3508ebf 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -7,12 +7,14 @@ namespace deepgalois { #ifndef GALOIS_USE_DIST -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, bool is_single) { #else void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop, Graph* dGraph) { #endif #ifndef GALOIS_USE_DIST + is_single_class = is_single; context = new deepgalois::Context(); num_samples = context->read_graph(dataset_str, selfloop); #else @@ -87,8 +89,13 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_USE_DIST unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; #else - unsigned myID = 0; + //std::string header = "[" + std::to_string(0) + "] "; + //std::string seperator = "\n"; + std::string header = ""; + std::string seperator = " "; #endif galois::gPrint("\nStart training...\n"); @@ -100,8 +107,7 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned i = 0; i < num_epochs; i++) { - galois::gPrint("[", myID, "] Epoch ", std::setw(2), i, std::fixed, - std::setprecision(3), "\n"); + galois::gPrint(header, "Epoch ", std::setw(3), i, seperator); t_epoch.Start(); // training steps @@ -131,8 +137,8 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); - galois::gPrint("[", myID, "] train_loss = ", std::setw(5), train_loss, " train_acc = ", - std::setw(5), train_acc, "\n"); + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, train_loss, + " train_acc ", train_acc, seperator); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); if (need_validate) { @@ -142,12 +148,12 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); - galois::gPrint("[", myID, "] val_loss = ", std::setw(5), val_loss, " val_acc = ", - std::setw(5), val_acc, "\n"); - galois::gPrint("[", myID, "] time = ", epoch_time + val_time, " ms (train_time = ", - 
epoch_time, " val_time = ", val_time, ")\n"); + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, + " val_acc ", val_acc, seperator); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, + " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); } else { - galois::gPrint("[", myID, "] train_time = ", epoch_time, " ms\n"); + galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms\n"); } } } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index d688258cd3..e23097befe 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -25,7 +25,7 @@ int main(int argc, char** argv) { #ifndef GALOIS_USE_DIST // read network, features, ground truth, initialize metadata - network.init(dataset, epochs, hidden1, add_selfloop); + network.init(dataset, epochs, hidden1, add_selfloop, is_single_class); #else network.init(dataset, epochs, hidden1, add_selfloop, dGraph); #endif diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index e932738636..7e2c3ec589 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -52,7 +52,9 @@ static cll::opt max_degree( static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt is_single_class("sc", + cll::desc("single-class or multi-class label (default single)"), cll::init(1)); //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 20b7a0985aeb521ccdd6c1f47f43574df1d0d00c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 18 Apr 2020 23:23:15 -0500 Subject: [PATCH 175/660] use binary input feature --- libdeepgalois/include/deepgalois/context.h | 27 ++++---- libdeepgalois/include/deepgalois/types.h | 2 +- libdeepgalois/include/deepgalois/utils.h | 4 +- libdeepgalois/src/context.cpp | 67 +++++++++++++------ .../src/layers/sigmoid_loss_layer.cpp | 12 ++-- libdeepgalois/src/net.cpp | 1 + 6 files changed, 70 insertions(+), 43 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 754b7a8491..a2407bd478 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -23,25 +23,20 @@ class Context { size_t read_graph(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); - size_t read_features(std::string dataset_str); - label_t get_label(size_t i) { return labels[i]; } - label_t* get_labels_ptr(size_t i) { return &(labels[0]); } + size_t read_features(std::string dataset_str, std::string filetype = "bin"); + label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label + label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label + label_t* get_labels_ptr(size_t i) { return labels; } float_t* get_in_ptr(); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); - //void set_label_class(bool is_single = true) { is_single_class = is_single; } + void set_label_class(bool is_single = true) { is_single_class = is_single; } - size_t n; // number of 
samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - //bool is_single_class; // single-class (one-hot) or multi-class label - std::vector labels; // labels for classification: N x 1 - label_t* d_labels; // labels on device - vec_t h_feats; // input features: N x D float_t* d_feats; // input features on device + label_t* d_labels; // labels on device float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY @@ -55,12 +50,16 @@ class Context { inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } + inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif protected: + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + bool is_single_class; // single-class (one-hot) or multi-class label + label_t *labels; // labels for classification: N x 1 + float_t* h_feats; // input features: N x D #ifndef CPU_ONLY static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 3c3c7ce747..e7600b4605 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -18,7 +18,7 @@ typedef std::vector typedef std::vector FV; // feature vector typedef std::vector FV2D; // feature vectors: num_samples x feature_dim typedef float acc_t; // Accuracy type -typedef short label_t; // label is for classification (supervised learning) +typedef uint8_t label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test typedef uint32_t VertexID; diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index ad33285879..8279dca8e8 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -110,7 +110,7 @@ inline bool bernoulli(float_t p) { inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { std::cout << "Dataset currently not supported\n"; exit(1); } @@ -145,7 +145,7 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks, Graph* dGraph) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { std::cout << "Dataset currently not supported\n"; exit(1); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 52db06ca62..8ffaacb8b6 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -94,7 +94,7 @@ Graph* Context::getGraphPointer() { return Context::graph_cpu; } -float_t* Context::get_in_ptr() { 
return &h_feats[0]; } +float_t* Context::get_in_ptr() { return h_feats; } void Context::norm_factor_counting() { norm_factor = new float_t[n]; @@ -123,16 +123,23 @@ size_t Context::read_labels(std::string dataset_str) { size_t m; // m: number of samples in >> m >> num_classes >> std::ws; assert(m == n); - labels.resize(m, 0); // label for each vertex: N x 1 + if (is_single_class) + labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + else + labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); unsigned x; for (size_t idx = 0; idx < num_classes; ++idx) { label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; + if (is_single_class) { + if (x != 0) { + labels[v] = idx; + break; + } + } else { + labels[v*num_classes+idx] = x; } } v++; @@ -142,36 +149,56 @@ size_t Context::read_labels(std::string dataset_str) { // print the number of vertex classes std::cout << "Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; return num_classes; } //! Read features, return the length of a feature vector //! Features are stored in the Context class -size_t Context::read_features(std::string dataset_str) { +size_t Context::read_features(std::string dataset_str, std::string filetype) { + //filetype = "txt"; std::cout << "Reading features ... "; Timer t_read; t_read.Start(); + size_t m; // m = number of vertices std::string filename = path + dataset_str + ".ft"; std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m = number of vertices - in >> m >> feat_len >> std::ws; - // assert(m == ); - h_feats.resize(m * feat_len, 0); - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - h_feats[u * feat_len + v] = w; + + if (filetype == "bin") { + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> feat_len >> std::ws; + ifs.close(); + } else { + in.open(filename, std::ios::in); + in >> m >> feat_len >> std::ws; + } + std::cout << "N x D: " << m << " x " << feat_len << "\n"; + h_feats = new float_t[m * feat_len]; + if (filetype == "bin") { + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary|std::ios::in); + in.read((char*)h_feats, sizeof(float_t) * m * feat_len); + } else { + std::string line; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + h_feats[u * feat_len + v] = w; + } } in.close(); t_read.Stop(); std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 6; i ++) + //for (auto j = 0; j < 6; j ++) + //std::cout << "feats[" << i << "][" << j << "] = " << h_feats[i*feat_len+j] << "\n"; return feat_len; } diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 220d3da102..30c01d846c 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -19,10 +19,10 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou // output is normalized input for this 
layer math::sigmoid(len, &in_data[len*i], &out_data[len*i]); // normalize using sigmoid // one hot encoded vector for the labels - std::vector groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[context->get_label(i)] = 1.0; // one-hot TODO: modify for multi-class label + acc_t *ground_truth = new acc_t[len]; + for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); // loss calculation - loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); + loss[i] = math::cross_entropy(len, ground_truth, &out_data[len*i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); } @@ -33,10 +33,10 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i] == 1) { // masked vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[context->get_label(i)] = 1.0; + acc_t *ground_truth = new acc_t[len]; + for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + math::d_cross_entropy(len, ground_truth, &out_data[len * i], &norm_grad[0]); // derviative sigmoid to gradient used in the next layer math::d_sigmoid(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 19a3508ebf..45e79f4cf4 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -16,6 +16,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, #ifndef GALOIS_USE_DIST is_single_class = is_single; context = new deepgalois::Context(); + context->set_label_class(is_single); num_samples = context->read_graph(dataset_str, selfloop); #else context = new deepgalois::DistContext(); From 8f9d86dc55b0ba72d1ce764c47dede9f1189d40e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Apr 2020 14:59:54 -0500 Subject: [PATCH 176/660] add multi-class accuracy --- libdeepgalois/CMakeLists.txt | 2 + libdeepgalois/include/deepgalois/context.h | 2 +- libdeepgalois/include/deepgalois/net.h | 58 ++------ libdeepgalois/include/deepgalois/utils.h | 82 ++---------- libdeepgalois/src/context.cpp | 7 +- .../src/layers/sigmoid_loss_layer.cpp | 14 +- libdeepgalois/src/net.cpp | 61 ++++++++- libdeepgalois/src/utils.cpp | 124 ++++++++++++++++++ 8 files changed, 216 insertions(+), 134 deletions(-) create mode 100644 libdeepgalois/src/utils.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index f92d8950a9..58538bb8b0 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -64,6 +64,7 @@ set(sources src/math_functions.cpp src/optimizer.cpp src/DistContext.cpp + src/utils.cpp src/node.cpp src/net.cpp ) @@ -78,6 +79,7 @@ set(sources src/optimizer.cpp src/context.cpp src/sampler.cpp + src/utils.cpp src/node.cpp src/net.cpp ) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index a2407bd478..b73de071cc 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -26,7 +26,7 @@ class Context { size_t read_features(std::string dataset_str, std::string filetype = "bin"); label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // 
multi-class label - label_t* get_labels_ptr(size_t i) { return labels; } + label_t* get_labels_ptr() { return labels; } float_t* get_in_ptr(); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index e29e1863ff..dcd538e7f6 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -41,8 +41,17 @@ class Net { size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } - void train(optimizer* opt, bool need_validate); // training void construct_layers(); + void append_out_layer(size_t layer_id); + void train(optimizer* opt, bool need_validate); // training + double evaluate(size_t begin, size_t end, size_t count, + mask_t* masks, acc_t& loss, acc_t& acc); // inference + + //! Add a convolution layer to the network + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true, + float_t dropout_rate = 0.5); + //! Save the context object to all layers of the network void set_contexts() { for (size_t i = 0; i < num_layers; i++) @@ -59,35 +68,6 @@ class Net { layers[i]->print_layer_info(); } - //! Add a convolution layer to the network - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true, - float_t dropout_rate = 0.5) { - assert(dropout_rate < 1.0); - assert(layer_id < NUM_CONV_LAYERS); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); - if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); - } - - //! Add an output layer to the network - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - if (is_single_class) - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - else - layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - connect(layers[layer_id - 1], layers[layer_id]); - } - //! forward propagation: [begin, end) is the range of samples used. //! calls "forward" on the layers of the network and returns the loss of the //! final layer @@ -120,18 +100,6 @@ class Net { } } - // evaluate, i.e. 
inference or predict - double evaluate(size_t begin, size_t end, size_t count, mask_t* masks, - acc_t& loss, acc_t& acc) { - // TODO may need to do something for the dist case - Timer t_eval; - t_eval.Start(); - loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); - t_eval.Stop(); - return t_eval.Millisecs(); - } - protected: #ifndef GALOIS_USE_DIST deepgalois::Context* context; @@ -143,13 +111,15 @@ class Net { size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 unsigned num_epochs; // number of epochs + std::vector feature_dims; // feature dimnesions for each layer std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network + // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, - Graph* dGraph); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 8279dca8e8..b7a84bb10a 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -9,6 +9,8 @@ #include #ifdef GALOIS_USE_DIST #include "deepgalois/gtypes.h" +#else +#include "deepgalois/types.h" #endif namespace deepgalois { @@ -103,80 +105,14 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) <= p; } +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, + size_t num_classes, label_t *ground_truth, float_t *pred); -#ifndef GALOIS_USE_DIST -//! Get masks from datafile where first line tells range of -//! set to create mask from -inline size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, - std::vector& masks) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - // std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count++; - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; -} +#ifdef GALOIS_USE_DIST +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, std::vector& masks, Graph* dGraph); #else -//! Get masks from datafile where first line tells range of -//! 
set to create mask from; needs graph object due to local IDs -inline size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, - std::vector& masks, Graph* dGraph) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - // only bother if it's local - if (dGraph->isLocal(i)) { - masks[dGraph->getLID(i)] = 1; - sample_count++; - } - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; -} +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, std::vector& masks); #endif - } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8ffaacb8b6..daf83a6b24 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -123,10 +123,13 @@ size_t Context::read_labels(std::string dataset_str) { size_t m; // m: number of samples in >> m >> num_classes >> std::ws; assert(m == n); - if (is_single_class) + if (is_single_class) { + std::cout << "Using single-class (one-hot) labels\n"; labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 - else + } else { + std::cout << "Using multi-class labels\n"; labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + } unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 30c01d846c..5a9508c1aa 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -49,14 +49,12 @@ acc_t sigmoid_loss_layer::get_masked_loss() { AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 45e79f4cf4..56cd0c2dfc 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -120,8 +120,13 @@ void Net::train(optimizer* opt, bool need_validate) { Tfw.start(); train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0], context->getGraphPointer()); // predict + if (is_single_class) { + train_acc = masked_accuracy(train_begin, train_end, train_count, + &train_mask[0], 
context->getGraphPointer()); // predict + } else { + train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, + &train_mask[0], context->getGraphPointer()); // predict + } Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -159,6 +164,18 @@ void Net::train(optimizer* opt, bool need_validate) { } } +// evaluate, i.e. inference or predict +double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, + acc_t& loss, acc_t& acc) { + // TODO may need to do something for the dist case + Timer t_eval; + t_eval.Start(); + loss = fprop(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + t_eval.Stop(); + return t_eval.Millisecs(); +} + void Net::construct_layers() { std::cout << "\nConstructing layers...\n"; append_conv_layer(0, true); // first conv layer @@ -169,6 +186,34 @@ void Net::construct_layers() { set_contexts(); } +//! Add an output layer to the network +void Net::append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + connect(layers[layer_id - 1], layers[layer_id]); +} + +//! Add a convolution layer to the network +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, + bool dropout, float_t dropout_rate) { + assert(dropout_rate < 1.0); + assert(layer_id < NUM_CONV_LAYERS); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); + if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); +} + #ifdef CPU_ONLY /** * @@ -176,8 +221,7 @@ void Net::construct_layers() { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, - Graph* dGraph) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { #ifndef GALOIS_USE_DIST AccumF accuracy_all; #else @@ -215,8 +259,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks } } #endif - }, - galois::loopname("getMaskedLoss")); + }, galois::loopname("getMaskedLoss")); #ifdef GALOIS_USE_DIST count = sampleCount.reduce(); @@ -226,6 +269,12 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)count; } + +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { + auto preds = layers[NUM_CONV_LAYERS - 1]->next()->get_data(); + auto ground_truth = context->get_labels_ptr(); + return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); +} #endif } // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp new file mode 100644 index 0000000000..d43bec403d --- /dev/null +++ b/libdeepgalois/src/utils.cpp @@ -0,0 +1,124 @@ +#include "galois/Galois.h" +#include "deepgalois/utils.h" + +namespace deepgalois { + +#define NUM_DATASETS 8 +const 
std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; + +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, + size_t num_classes, label_t *ground_truth, float_t *pred) { + std::vector true_positive(num_classes, 0); + std::vector false_positive(num_classes, 0); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + for (size_t j = 0; j < num_classes; j++) { + auto idx = i * num_classes + j; + if (ground_truth[idx] == 1 && pred[idx] > 0.5) { + true_positive[j] ++; + } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { + false_positive[j] ++; + } + } + } + }, galois::loopname("MaskedF1Score")); + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + for (size_t i = 0; i < num_classes; i++) { + auto fp = false_positive[i]; // false positive + auto tp = true_positive[i]; // true positive + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + } + acc_t precisionMicro = pNumerator / pDenominator; + return precisionMicro; +} + +#ifndef GALOIS_USE_DIST +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, std::vector& masks) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count/(float)masks.size()*(float)100 << "\%)\n"; + in.close(); + return sample_count; +} +#else +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, + std::vector& masks, Graph* dGraph) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; +} +#endif + +} From 966dc071c63b43d6e8f6e6ce1076774a93f7f759 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Apr 2020 18:51:35 -0500 Subject: [PATCH 177/660] 
fix f1 --- .../deepgalois/layers/softmax_loss_layer.h | 2 +- libdeepgalois/src/context.cpp | 6 +- .../src/layers/sigmoid_loss_layer.cpp | 21 +++--- .../src/layers/softmax_loss_layer.cpp | 72 +++++++++---------- .../src/layers/softmax_loss_layer.cu | 4 ++ libdeepgalois/src/net.cpp | 6 +- libdeepgalois/src/utils.cpp | 26 ++++++- 7 files changed, 88 insertions(+), 49 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 798ad7a79a..7194d06f2e 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -6,7 +6,7 @@ class softmax_loss_layer : public layer { public: softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); - ~softmax_loss_layer() {} + ~softmax_loss_layer(); std::string layer_type() const override { return std::string("softmax_loss"); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index daf83a6b24..8d6616182f 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -8,7 +8,11 @@ namespace deepgalois { #ifdef CPU_ONLY Context::Context() {} -Context::~Context() {} +Context::~Context() { + if (labels) delete labels; + if (h_feats) delete h_feats; + if (norm_factor) delete norm_factor; +} size_t Context::read_graph(std::string dataset_str, bool selfloop) { n = read_graph_cpu(dataset_str, "gr", selfloop); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 5a9508c1aa..57264976db 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -16,13 +16,15 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked + size_t idx = len * i; // output is normalized input for this layer - math::sigmoid(len, &in_data[len*i], &out_data[len*i]); // normalize using sigmoid + math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid // one hot encoded vector for the labels - acc_t *ground_truth = new acc_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); + float_t *ground_truth = new float_t[len]; + for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); // loss calculation - loss[i] = math::cross_entropy(len, ground_truth, &out_data[len*i]); + loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); + delete ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); } @@ -32,13 +34,16 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - acc_t *ground_truth = new acc_t[len]; + size_t idx = len * i; + float_t *norm_grad = new float_t[len]; + float_t *ground_truth = new float_t[len]; for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, ground_truth, &out_data[len * i], &norm_grad[0]); + math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer - math::d_sigmoid(len, 
&in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); + math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], norm_grad); + delete norm_grad; + delete ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 9b64a0d353..7c5b11d233 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -12,27 +12,29 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, loss = new float_t[in_dims[0]]; // error for each sample } +softmax_loss_layer::~softmax_loss_layer() { + delete loss; +} + // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - // output is normalized input for this layer - math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // one hot encoded vector for the labels - std::vector groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[context->get_label(i)] = 1.0; // one-hot - // loss calculation - loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); - } - }, galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-fw")); + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + // output is normalized input for this layer + math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax + // one hot encoded vector for the labels + std::vector groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[context->get_label(i)] = 1.0; // one-hot + // loss calculation + loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); + } + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-fw")); - // no sync required in distributed execution since no graph topology used - // in this forward pass; only a post-process pretty much + // no sync required in distributed execution since no graph topology used + // in this forward pass; only a post-process pretty much } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -40,20 +42,18 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; - galois::do_all(galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[context->get_label(i)] = 1.0; - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); - // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); - } - }, galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-bw")); + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector groundTruth(len, 0.0); + groundTruth[context->get_label(i)] = 1.0; + // use ground truth to determine 
derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-bw")); // no weight sync required: this is all local graph information } @@ -64,14 +64,12 @@ acc_t softmax_loss_layer::get_masked_loss() { AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index e9216b1ae2..6ed45bc98e 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -41,6 +41,10 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, float_malloc_device(in_dims[0], loss); } +softmax_loss_layer::~softmax_loss_layer() { + float_free_device(loss); +} + void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 56cd0c2dfc..340b77650a 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -171,7 +171,11 @@ double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + if (is_single_class) { + acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + } else { + acc = masked_multi_class_accuracy(begin, end, count, masks, context->getGraphPointer()); + } t_eval.Stop(); return t_eval.Millisecs(); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index d43bec403d..72d6560aca 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -6,10 +6,24 @@ namespace deepgalois { #define NUM_DATASETS 8 const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; +// Compute the F1 score, also known as balanced F-score or F-measure +// The F1 score can be interpreted as a weighted average of the precision and recall, +// where an F1 score reaches its best value at 1 and worst score at 0. +// The relative contribution of precision and recall to the F1 score are equal. +// The formula for the F1 score is: +// F1 = 2 * (precision * recall) / (precision + recall) +// where precision = TP / (TP + FP), recall = TP / (TP + FN) +// TP: true positive; FP: false positive; FN: false negtive. +// In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. 
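// As a worked example with made-up counts (purely illustrative, not taken from any dataset used here):
// suppose two classes where class 0 has TP=8, FP=2, FN=4 and class 1 has TP=3, FP=1, FN=2.
// Micro-averaging pools the per-class counts before forming the ratios:
//   precision_micro = (8+3) / (8+3+2+1) = 11/14 ~= 0.786
//   recall_micro    = (8+3) / (8+3+4+2) = 11/17 ~= 0.647
//   F1_micro        = 2 * 0.786 * 0.647 / (0.786 + 0.647) ~= 0.71
// This is the micro-averaged F1 computed below (with beta = 1).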
+// Please refer to https://sebastianraschka.com/faq/docs/multiclass-metric.html, +// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf (p.1672) +// and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { + float beta = 1; std::vector true_positive(num_classes, 0); std::vector false_positive(num_classes, 0); + std::vector false_negtive(num_classes, 0); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { for (size_t j = 0; j < num_classes; j++) { @@ -18,20 +32,30 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, true_positive[j] ++; } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { false_positive[j] ++; + } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { + false_negtive[j] ++; } } } }, galois::loopname("MaskedF1Score")); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; for (size_t i = 0; i < num_classes; i++) { + auto fn = false_negtive[i]; // false negtive auto fp = false_positive[i]; // false positive auto tp = true_positive[i]; // true positive pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); } + auto recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - return precisionMicro; + auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + return fscoreMicro; } #ifndef GALOIS_USE_DIST From 20a7e5224b63431bc3e11936916bc95ffca763da Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Apr 2020 19:52:45 -0500 Subject: [PATCH 178/660] add scripts --- libdeepgalois/scripts/run-multi.sh | 33 ++++++++++++++++++++++++++++ libdeepgalois/scripts/run-single.sh | 33 ++++++++++++++++++++++++++++ libdeepgalois/scripts/test-multi.sh | 1 + libdeepgalois/scripts/test-single.sh | 1 + libdeepgalois/src/net.cpp | 6 +++++ lonestargnn/include/lonestargnn.h | 6 ++--- 6 files changed, 77 insertions(+), 3 deletions(-) create mode 100755 libdeepgalois/scripts/run-multi.sh create mode 100755 libdeepgalois/scripts/run-single.sh create mode 100755 libdeepgalois/scripts/test-multi.sh create mode 100755 libdeepgalois/scripts/test-single.sh diff --git a/libdeepgalois/scripts/run-multi.sh b/libdeepgalois/scripts/run-multi.sh new file mode 100755 index 0000000000..660fac74b3 --- /dev/null +++ b/libdeepgalois/scripts/run-multi.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +#GRAPHS="ppi yelp amazon" +GRAPHS="ppi" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 64 128" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. 
Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh new file mode 100755 index 0000000000..9c0d9fcb63 --- /dev/null +++ b/libdeepgalois/scripts/run-single.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +GRAPHS="cora citeseer pudmed flickr reddit" +#GRAPHS="cora" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 32 64 128 256 512" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/test-multi.sh b/libdeepgalois/scripts/test-multi.sh new file mode 100755 index 0000000000..a67bd047a8 --- /dev/null +++ b/libdeepgalois/scripts/test-multi.sh @@ -0,0 +1 @@ +./gcn ppi -k=20 -t=14 -sc=0 -h=128 diff --git a/libdeepgalois/scripts/test-single.sh b/libdeepgalois/scripts/test-single.sh new file mode 100755 index 0000000000..78093d71ed --- /dev/null +++ b/libdeepgalois/scripts/test-single.sh @@ -0,0 +1 @@ +./gcn cora -k=200 -t=14 diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 340b77650a..6d390ea867 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -104,6 +104,7 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); + double total_train_time = 0.0; Timer t_epoch; // run epochs @@ -147,6 +148,7 @@ void Net::train(optimizer* opt, bool need_validate) { " train_acc ", train_acc, seperator); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; if (need_validate) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; @@ -162,6 +164,10 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms\n"); } } + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; + galois::gPrint("\nAverage training time: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. 
inference or predict diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 7e2c3ec589..72acb8d1ff 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -19,9 +19,9 @@ namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt - filetype(cll::Positional, cll::desc(""), - cll::init("gr")); // file format of the input graph +//static cll::opt +// filetype(cll::Positional, cll::desc(""), +// cll::init("gr")); // file format of the input graph static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' From b9401f378244a0db3d623dc6882759b2b5ad1994 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 20 Apr 2020 19:54:49 -0500 Subject: [PATCH 179/660] add gpu sigmoid --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/context.h | 3 +- .../deepgalois/layers/sigmoid_loss_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 7 ++ libdeepgalois/include/deepgalois/net.h | 5 ++ libdeepgalois/include/deepgalois/sampler.h | 35 +++++++- libdeepgalois/include/deepgalois/types.h | 2 + libdeepgalois/src/context.cu | 5 +- .../src/layers/sigmoid_loss_layer.cpp | 4 + .../src/layers/sigmoid_loss_layer.cu | 38 +++++++++ .../src/layers/softmax_loss_layer.cu | 30 +------ libdeepgalois/src/math_functions.cu | 83 +++++++++++++++++-- libdeepgalois/src/net.cpp | 21 +++-- libdeepgalois/src/net.cu | 74 ++++++++++++++++- libdeepgalois/src/sampler.cpp | 51 +++++++----- libdeepgalois/src/utils.cpp | 2 +- 16 files changed, 289 insertions(+), 74 deletions(-) create mode 100644 libdeepgalois/src/layers/sigmoid_loss_layer.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 58538bb8b0..c2c64d4f0c 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,6 +40,7 @@ else() set(CUDA_SOURCES src/layers/graph_conv_layer.cu src/layers/softmax_loss_layer.cu + src/layers/sigmoid_loss_layer.cu src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index b73de071cc..d6bb004b7a 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -44,13 +44,14 @@ class Context { void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph - Graph* getGraphPointer(); + Graph* getCpuGraphPointer(); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } + CSRGraph* getGpuGraphPointer() { return &graph_gpu; } #endif protected: diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 31bab85daa..334bf4363e 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -6,7 +6,7 @@ class sigmoid_loss_layer : public layer { public: sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); - ~sigmoid_loss_layer() {} + ~sigmoid_loss_layer(); std::string layer_type() const override { return std::string("sigmoid_loss"); } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 46f571ac35..2c3a8014ee 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -147,6 +147,12 @@ void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_da void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); +void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out_data); +void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); @@ -155,4 +161,5 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index dcd538e7f6..98573d60b5 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -118,8 +118,13 @@ class Net { std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) +#ifdef CPU_ONLY acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); +#else + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); +#endif }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 079a84d415..8842f0e442 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ 
b/libdeepgalois/include/deepgalois/sampler.h @@ -1,7 +1,34 @@ #pragma once #include "deepgalois/gtypes.h" -void subgraph_sampler(Graph &g, Graph &sg); -galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); -Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } -Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } +namespace deepgalois { +class Sampler { +public: + Sampler() : m(1000) {} + ~Sampler() {} + + // sample a subgraph sg of size n from graph g + void subgraph_sampler(Graph &g, Graph &sg, size_t n); + + // !API function for user-defined selection strategy + virtual void select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m); + + galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); + + Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } + + Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } + +protected: + size_t m; + // Utility function to randomly select k items from [begin, end) + VertexList selectVertex(GNode begin, GNode end, size_t k); + // Utility function to find ceiling of r in arr[l..h] + inline int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h); + // Utility function to select one element from n elements given a frequency (probability) distribution + size_t selectOneVertex(size_t n, std::vector dist); + // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g + void generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub); +}; + +} diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index e7600b4605..35f9970b4c 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -22,6 +22,8 @@ typedef uint8_t label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test typedef uint32_t VertexID; +typedef uint64_t EdgeID; +typedef std::vector VertexList; #define CHUNK_SIZE 256 #define TB_SIZE 256 diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 4d77433eda..34db607c60 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -126,9 +126,8 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { } void Context::copy_data_to_device() { - assert(labels.size() == n); CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), + CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), @@ -138,7 +137,7 @@ void Context::copy_data_to_device() { //void Context::copy_data_to_device() { //float_malloc_device(n, d_labels); - //float_copy_device(n, &labels[0], d_labels); + //float_copy_device(n, labels, d_labels); //float_malloc_device(n*feat_len, d_feats); //float_copy_device(n*feat_len, &h_feats[0], d_feats); //} diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 57264976db..4a76861860 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -12,6 +12,10 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, loss = new float_t[in_dims[0]]; // error for 
each sample } +sigmoid_loss_layer::~sigmoid_loss_layer() { + delete loss; +} + void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu new file mode 100644 index 0000000000..185a03f1fe --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -0,0 +1,38 @@ +#include "deepgalois/layers/sigmoid_loss_layer.h" +#include "gg.h" +#include "ggcuda.h" + +namespace deepgalois { + +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + float_malloc_device(in_dims[0], loss); +} + +sigmoid_loss_layer::~sigmoid_loss_layer() { + float_free_device(loss); +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, + d_masks_, context->d_labels, loss, out_data); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, + context->d_labels, out_data, in_grad); +} + +acc_t sigmoid_loss_layer::get_masked_loss() { + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); +} + +} // namespace diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 6ed45bc98e..b232284017 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -2,34 +2,6 @@ #include "gg.h" #include "ggcuda.h" -__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, - float_t* loss, - HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) - // total += loss[begin+i]; - total.reduce(loss[begin + i]); - } - total.thread_exit>(local_loss); -} - -//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, - float_t* loss) { - assert(count > 0); - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>( - begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()) / count; -} - namespace deepgalois { softmax_loss_layer::softmax_loss_layer(unsigned level, @@ -60,7 +32,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, } acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, d_masks_, loss); + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } } // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 8002d728a5..5e607f6bed 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -311,6 +311,11 @@ __device__ void softmax_device(int n, const float_t* input, float_t* output) { } } +__device__ void sigmoid_device(int n, const float_t* in, float_t* out) { 
+ for (int i = 0; i < n; i++) + out[i] = 1. / (1. + expf(-in[i])); +} + __device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, float_t& loss) { if (p[idx] == 0.0) loss -= logf(float_t(1e-10)); else loss -= logf(p[idx]); @@ -343,6 +348,31 @@ void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, CudaTest("solving softmax_cross_entropy kernel failed"); } +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + sigmoid_device(len, in_data + len*id, out_data + len*id); + cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); + } + } +} + +void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + sigmoid_cross_entropy_kernel<<>>( + len, begin, end, in, masks, labels, loss, out); + CudaTest("solving sigmoid_cross_entropy kernel failed"); +} + __device__ void d_cross_entropy_device(int n, const label_t idx, const float_t* p, float_t* d) { for (int i = 0; i < n; i++) { if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); @@ -394,7 +424,7 @@ __global__ void d_cross_entropy_warp(int len, int begin, int end, } } } -// TODO: use warp + __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { for (int i = 0; i < n; i++) { dy[i] = 0; @@ -406,8 +436,8 @@ __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, flo } __global__ void d_softmax_kernel(int len, int begin, int end, - const mask_t* masks, const float_t* data, - const float_t* in_grad, float_t* out_grad) { + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { CUDA_KERNEL_LOOP(i, end-begin) { int id = begin + i; if (masks[id] == 1) { // masked @@ -417,8 +447,8 @@ __global__ void d_softmax_kernel(int len, int begin, int end, } __global__ void d_softmax_warp(int len, int begin, int end, - const mask_t* masks, const float_t* data, - const float_t* in_grad, float_t* out_grad) { + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index @@ -457,8 +487,8 @@ __global__ void d_softmax_warp(int len, int begin, int end, } __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { CUDA_KERNEL_LOOP(i, end-begin) { int id = begin + i; if (masks[id] == 1) { // masked @@ -536,3 +566,42 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); } +__global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + +} + +void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + 
d_sigmoid_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + len, begin, end, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); +} + +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, + float_t* loss, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss) { + assert(count > 0); + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6d390ea867..6f1d18351d 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -119,14 +119,18 @@ void Net::train(optimizer* opt, bool need_validate) { // forward: after this phase, layer edges will contain intermediate features // for use during backprop Tfw.start(); - train_loss = - Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward +#ifdef CPU_ONLY + Graph *g = context->getGraphPointer(); +#else + CSRGraph *g = context->getGpuGraphPointer(); +#endif if (is_single_class) { train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0], context->getGraphPointer()); // predict + &train_mask[0], g); // predict } else { train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, - &train_mask[0], context->getGraphPointer()); // predict + &train_mask[0], g); // predict } Tfw.stop(); @@ -177,10 +181,15 @@ double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); +#ifdef CPU_ONLY + Graph* g = context->getCpuGraphPointer(); +#else + CSRGraph* g = context->getGpuGraphPointer(); +#endif if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + acc = masked_accuracy(begin, end, count, masks, g); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, context->getGraphPointer()); + acc = masked_multi_class_accuracy(begin, end, count, masks, g); } t_eval.Stop(); return t_eval.Millisecs(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 62dec7cad4..70f70b9a88 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -34,7 +34,6 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, total.thread_exit>(local_accuracy); } -//acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels) { assert(count > 0); @@ -49,12 +48,83 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, return *(total_accuracy.cpu_rd_ptr()) / count; } +__global__ void masked_f1_score_kernel(int num_classes, int begin, + int end, mask_t* masks, + 
float_t* preds, label_t* labels, + float_t* true_positive, + float_t* false_positive, + float_t* false_negtive) { + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + for (size_t j = 0; j < num_classes; j++) { + auto idx = i * num_classes + j; + if (labels[idx] == 1 && preds[idx] > 0.5) { + true_positive[j] ++; + } else if (labels[idx] == 0 && preds[idx] > 0.5) { + false_positive[j] ++; + } else if (labels[idx] == 1 && preds[idx] <= 0.5) { + false_negtive[j] ++; + } + } + } + } +} + +acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { + float beta = 1.0; + assert(count > 0); + float *h_tp = new float[num_classes]; + float *h_fp = new float[num_classes]; + float *h_fn = new float[num_classes]; + float *d_tp, *d_fp, *d_fn; + float_malloc_device(num_classes, d_tp); + float_malloc_device(num_classes, d_fp); + float_malloc_device(num_classes, d_fn); + masked_f1_score_kernel<<>>( + num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); + cudaMemcpy(&h_tp, d_tp, sizeof(bool), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_fp, d_fp, sizeof(bool), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_fn, d_fn, sizeof(bool), cudaMemcpyDeviceToHost); + + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; + for (size_t i = 0; i < num_classes; i++) { + auto fn = h_fn[i]; // false negtive + auto fp = h_fp[i]; // false positive + auto tp = h_tp[i]; // true positive + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); + } + auto recallMicro = rNumerator / rDenominator; + acc_t precisionMicro = pNumerator / pDenominator; + auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + float_free_device(d_tp); + float_free_device(d_fp); + float_free_device(d_fn); + return fscoreMicro; +} + namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { + mask_t* masks, CSRGraph *g) { return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->d_labels); } + +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, CSRGraph* g) { + return masked_f1_score_gpu(num_classes, begin, end, count, + layers[NUM_CONV_LAYERS]->get_device_masks(), + layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + context->d_labels); } + +} // end namespace diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 98dac8c75c..fdfb9802cf 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -2,13 +2,19 @@ #include #include -// selecet k vertices from begin to end -static std::vector selectVertex(GNode begin, GNode end, size_t k) { +inline unsigned getDegree(Graph &g, GNode v) { + return std::distance(g.edge_begin(v), g.edge_end(v)); +} + +namespace deepgalois { + +// Utility function to randomly select k items from [begin, end) +VertexList Sampler::selectVertex(GNode begin, GNode end, size_t k) { auto i = begin; // reservoir[] is the output array. 
Initialize // it with first k vertices - std::vector reservoir(k); + VertexList reservoir(k); for (; i < k; i++) reservoir[i] = i; // Use a different seed value so that we don't get @@ -29,7 +35,7 @@ static std::vector selectVertex(GNode begin, GNode end, size_t k) { } // Utility function to find ceiling of r in arr[l..h] -int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { +inline int Sampler::findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { unsigned mid; while (l < h) { mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 @@ -38,9 +44,9 @@ int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { return (arr[l] >= r) ? l : -1; } -// select one element from n elements given a frequency (probability) distribution +// Utility function to select one element from n elements given a frequency (probability) distribution // https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ -size_t selectOneVertex(size_t n, std::vector dist) { +size_t Sampler::selectOneVertex(size_t n, std::vector dist) { std::vector offsets(n); offsets[0] = dist[0]; // compute the prefix sum of the distribution @@ -53,17 +59,14 @@ size_t selectOneVertex(size_t n, std::vector dist) { return findCeil(offsets, r, 0, n - 1); } -inline unsigned getDegree(Graph &g, GNode v) { - return std::distance(g.edge_begin(v), g.edge_end(v)); -} - -void generate_subgraph(std::set &vertex_set, Graph &g, Graph &sub) { +// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g +void Sampler::generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub) { auto nv = vertex_set.size(); size_t ne = 0; std::vector offsets(nv+1); offsets[0] = 0; size_t i = 0; - std::vector vertices(nv); + VertexList vertices(nv); for (auto v : vertex_set) { vertices[i] = v; offsets[i+1] = offsets[i] + getDegree(g, v); @@ -80,15 +83,15 @@ void generate_subgraph(std::set &vertex_set, Graph &g, Graph &sub) { } } -// generate a subgraph sg with size n from the input graph g -// n: number of vertices in the subgraph -// m: number of vertices in the frontier -void subgraph_sampler(Graph &g, Graph &sg, size_t n, size_t m) { +// !API function for user-defined selection strategy +// Select n vertices from graph g and put them in vertex_set. +// n: number of vertices in the subgraph; +// m: number of vertices in the frontier. 
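// A minimal usage sketch (hypothetical driver code; the variable names full_graph and
// subgraph are placeholders, and 9000 is an arbitrary sample size):
//   deepgalois::Sampler sampler;                           // frontier size m defaults to 1000
//   Graph subgraph;
//   sampler.subgraph_sampler(full_graph, subgraph, 9000);  // sample a 9000-vertex subgraph
// subgraph_sampler() first calls select_vertices() to pick the vertex set, then
// generate_subgraph() to build the sampled graph from those vertices.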
+void Sampler::select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m) { + assert(n == vertex_set.size()); auto num_vertices = g.size(); // number of vertices in the original input graph auto frontier = selectVertex(0, num_vertices, m); // randomly select m vertices from g as frontier - std::set vertex_set; - for (size_t i = 0; i < m; i++) - vertex_set.insert(frontier[i]); + for (size_t i = 0; i < m; i++) vertex_set[i] = frontier[i]; std::vector degrees(m); //std::vector probabilities(m); //unsigned sum_degree = 0; @@ -107,7 +110,15 @@ void subgraph_sampler(Graph &g, Graph &sg, size_t n, size_t m) { degrees[pos] = getDegree(g, frontier[pos]); //sum_degree -= degree; //sum_degree += degrees[pos]; - vertex_set.insert(u); + vertex_set.push_back(u); } +} + +void Sampler::subgraph_sampler(Graph &g, Graph&sg, size_t n) { + VertexList vertex_set(n); + select_vertices(g, vertex_set, n, m); generate_subgraph(vertex_set, g, sg); } + +} // end namespace + diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 72d6560aca..9030af2249 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -20,7 +20,7 @@ const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pub // and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { - float beta = 1; + float beta = 1.0; std::vector true_positive(num_classes, 0); std::vector false_positive(num_classes, 0); std::vector false_negtive(num_classes, 0); From de60b193054ac35bf0bd61917d2648d450252e6c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 20 Apr 2020 21:35:30 -0500 Subject: [PATCH 180/660] fix --- libdeepgalois/include/deepgalois/DistContext.h | 2 +- libdeepgalois/scripts/run-single.sh | 2 +- libdeepgalois/src/context.cpp | 2 +- libdeepgalois/src/net.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 704247d54b..37e2eea372 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -39,7 +39,7 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - Graph* getGraphPointer() { + Graph* getCpuGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh index 9c0d9fcb63..37a393d788 100755 --- a/libdeepgalois/scripts/run-single.sh +++ b/libdeepgalois/scripts/run-single.sh @@ -3,7 +3,7 @@ GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn GNNS="gcn" -GRAPHS="cora citeseer pudmed flickr reddit" +GRAPHS="cora citeseer pubmed flickr reddit" #GRAPHS="cora" EPOCHS="200" NTHREADS="56" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8d6616182f..7a94df8c17 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -94,7 +94,7 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -Graph* Context::getGraphPointer() { +Graph* Context::getCpuGraphPointer() { return Context::graph_cpu; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6f1d18351d..08af8872f0 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -121,7 +121,7 @@ void Net::train(optimizer* opt, bool need_validate) { Tfw.start(); 
train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward #ifdef CPU_ONLY - Graph *g = context->getGraphPointer(); + Graph *g = context->getCpuGraphPointer(); #else CSRGraph *g = context->getGpuGraphPointer(); #endif From d8943bafa95b694d246e20377aafedcbb3c5abda Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 09:25:47 -0500 Subject: [PATCH 181/660] fix minor --- libdeepgalois/include/deepgalois/context.h | 1 + libdeepgalois/include/deepgalois/types.h | 2 +- libdeepgalois/src/context.cpp | 7 ++++++- libdeepgalois/src/context.cu | 14 ++++++++++++-- libdeepgalois/src/math_functions.cu | 12 +++++++++++- 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index d6bb004b7a..206b395bb8 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -59,6 +59,7 @@ class Context { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label + bool is_selfloop_added; // whether selfloop is added to the input graph label_t *labels; // labels for classification: N x 1 float_t* h_feats; // input features: N x D #ifndef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 35f9970b4c..92e0d31772 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -29,7 +29,7 @@ typedef std::vector VertexList; #define TB_SIZE 256 #define BLOCK_SIZE 256 #define WARP_SIZE 32 -#define MAX_NUM_CLASSES 64 +#define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 7a94df8c17..98b3f7ed15 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -7,7 +7,11 @@ namespace deepgalois { #ifdef CPU_ONLY -Context::Context() {} +Context::Context() : n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), + labels(NULL), h_feats(NULL), norm_factor(NULL), + d_labels(NULL), d_feats(NULL) {} + Context::~Context() { if (labels) delete labels; if (h_feats) delete h_feats; @@ -37,6 +41,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo Graph graph_temp; galois::graphs::readGraph(graph_temp, filename); add_selfloop(graph_temp, *graph_cpu); + is_selfloop_added = selfloop; } else galois::graphs::readGraph(*graph_cpu, filename); // TODO dist version of self loop } else { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 34db607c60..4ed442c70d 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -61,7 +61,10 @@ cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() { +Context::Context() : n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), + labels(NULL), h_feats(NULL), norm_factor(NULL), + d_labels(NULL), d_feats(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -89,6 +92,10 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { void Context::norm_factor_counting() { std::cout << "Pre-computing normalization factor (n=" 
<< n << ") ... "; + if (!is_selfloop_added) { + std::cout << "Set -sl=1 to add selfloop\n"; + exit(0); + } #ifdef USE_CUSPARSE int nnz = graph_gpu.nedges; CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); @@ -120,7 +127,10 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { std::string filename = path + dataset_str + ".csgr"; CSRGraph g; g.read(filename.c_str(), false); - if (selfloop) g.add_selfloop(); + if (selfloop) { + g.add_selfloop(); + is_selfloop_added = selfloop; + } g.copy_to_gpu(graph_gpu); return graph_gpu.nnodes; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 5e607f6bed..fa5f02de21 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -321,6 +321,16 @@ __device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, else loss -= logf(p[idx]); } +// y: ground truth +// p: predictions +__device__ void cross_entropy_multi_device(int n, const label_t *y, const float_t* p, float_t& loss) { + for (int i = 0; i < n; i++) { + if (y[i] == 0) continue; + if (p[i] == float_t(0)) loss -= logf(float_t(1e-10)); // avoid NaN exception + else loss -= logf(p[i]); + } +} + // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss @@ -360,7 +370,7 @@ __global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, int id = begin + i; if (masks[id] == 1) { // masked sigmoid_device(len, in_data + len*id, out_data + len*id); - cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); + cross_entropy_multi_device(len, labels, out_data + len*id, loss[id]); } } } From f7dfad169878768f2b0eef400aef87d5b746a236 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 10:22:28 -0500 Subject: [PATCH 182/660] refine --- libdeepgalois/include/deepgalois/context.h | 14 ++++--- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../src/layers/sigmoid_loss_layer.cu | 4 +- .../src/layers/softmax_loss_layer.cu | 4 +- libdeepgalois/src/net.cu | 41 ++++++++++--------- libdeepgalois/src/utils.cpp | 18 ++++---- 6 files changed, 43 insertions(+), 40 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 206b395bb8..e40b6a6371 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -22,23 +22,22 @@ class Context { ~Context(); size_t read_graph(std::string dataset_str, bool selfloop); + size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); + size_t read_graph_gpu(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str, std::string filetype = "bin"); + label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label label_t* get_labels_ptr() { return labels; } + label_t* get_labels_device_ptr() { return d_labels; } float_t* get_in_ptr(); + float_t* get_norm_factor() { return norm_factor; } - size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); - size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); void set_label_class(bool is_single = true) { is_single_class = is_single; } - float_t* d_feats; // input features on device - 
label_t* d_labels; // labels on device - float_t* norm_factor; // normalization constant based on graph structure - #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); @@ -62,6 +61,9 @@ class Context { bool is_selfloop_added; // whether selfloop is added to the input graph label_t *labels; // labels for classification: N x 1 float_t* h_feats; // input features: N x D + float_t* norm_factor; // normalization constant based on graph structure + label_t* d_labels; // labels on device + float_t* d_feats; // input features on device #ifndef CPU_ONLY static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 0bf7a7e698..c9b8729d62 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -36,7 +36,7 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->norm_factor; } + void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor(); } //! Uses weights contained in this layer to update in_data (results from previous) //! and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 185a03f1fe..6d7268d4af 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -21,14 +21,14 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->d_labels, loss, out_data); + d_masks_, context->get_labels_device_ptr(), loss, out_data); } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->d_labels, out_data, in_grad); + context->get_labels_device_ptr(), out_data, in_grad); } acc_t sigmoid_loss_layer::get_masked_loss() { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index b232284017..c2f3a98303 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -21,14 +21,14 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->d_labels, loss, out_data); + d_masks_, context->get_labels_device_ptr(), loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->d_labels, out_data, in_grad); + context->get_labels_device_ptr(), out_data, in_grad); } acc_t softmax_loss_layer::get_masked_loss() { diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 
70f70b9a88..e7f7d7b603 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -48,22 +48,23 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, return *(total_accuracy.cpu_rd_ptr()) / count; } +typedef float f1count_t; __global__ void masked_f1_score_kernel(int num_classes, int begin, int end, mask_t* masks, float_t* preds, label_t* labels, - float_t* true_positive, - float_t* false_positive, - float_t* false_negtive) { + f1count_t* true_positive, + f1count_t* false_positive, + f1count_t* false_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { for (size_t j = 0; j < num_classes; j++) { auto idx = i * num_classes + j; if (labels[idx] == 1 && preds[idx] > 0.5) { - true_positive[j] ++; + atomicAdd(&true_positive[j], 1.0); } else if (labels[idx] == 0 && preds[idx] > 0.5) { - false_positive[j] ++; + atomicAdd(&false_positive[j], 1.0); } else if (labels[idx] == 1 && preds[idx] <= 0.5) { - false_negtive[j] ++; + atomicAdd(&false_negtive[j], 1.0); } } } @@ -74,35 +75,35 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels) { float beta = 1.0; assert(count > 0); - float *h_tp = new float[num_classes]; - float *h_fp = new float[num_classes]; - float *h_fn = new float[num_classes]; - float *d_tp, *d_fp, *d_fn; + f1count_t* h_tp = new f1count_t[num_classes]; + f1count_t* h_fp = new f1count_t[num_classes]; + f1count_t* h_fn = new f1count_t[num_classes]; + f1count_t* d_tp, *d_fp, *d_fn; float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); masked_f1_score_kernel<<>>( num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); - cudaMemcpy(&h_tp, d_tp, sizeof(bool), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_fp, d_fp, sizeof(bool), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_fn, d_fn, sizeof(bool), cudaMemcpyDeviceToHost); + cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); + cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); + cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; acc_t rNumerator = 0.0; acc_t rDenominator = 0.0; for (size_t i = 0; i < num_classes; i++) { - auto fn = h_fn[i]; // false negtive - auto fp = h_fp[i]; // false positive - auto tp = h_tp[i]; // true positive + acc_t fn = (acc_t)h_fn[i]; // false negtive + acc_t fp = (acc_t)h_fp[i]; // false positive + acc_t tp = (acc_t)h_tp[i]; // true positive pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); rNumerator = rNumerator + tp; rDenominator = rDenominator + (tp + fn); } - auto recallMicro = rNumerator / rDenominator; + acc_t recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + acc_t fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / ((beta * beta) * precisionMicro + recallMicro); float_free_device(d_tp); float_free_device(d_fp); @@ -116,7 +117,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS - 1]->next()->get_data(), - context->d_labels); + context->get_labels_device_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, @@ -124,7 +125,7 @@ acc_t 
Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, return masked_f1_score_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS - 1]->next()->get_data(), - context->d_labels); + context->get_labels_device_ptr()); } } // end namespace diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 9030af2249..77657c3f3c 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -21,19 +21,19 @@ const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pub acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { float beta = 1.0; - std::vector true_positive(num_classes, 0); - std::vector false_positive(num_classes, 0); - std::vector false_negtive(num_classes, 0); + std::vector true_positive(num_classes, 0); + std::vector false_positive(num_classes, 0); + std::vector false_negtive(num_classes, 0); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { for (size_t j = 0; j < num_classes; j++) { auto idx = i * num_classes + j; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { - true_positive[j] ++; + __sync_fetch_and_add(&true_positive[j], 1); } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { - false_positive[j] ++; + __sync_fetch_and_add(&false_positive[j], 1); } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { - false_negtive[j] ++; + __sync_fetch_and_add(&false_negtive[j], 1); } } } @@ -43,9 +43,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, acc_t rNumerator = 0.0; acc_t rDenominator = 0.0; for (size_t i = 0; i < num_classes; i++) { - auto fn = false_negtive[i]; // false negtive - auto fp = false_positive[i]; // false positive - auto tp = true_positive[i]; // true positive + acc_t fn = (acc_t)false_negtive[i]; // false negtive + acc_t fp = (acc_t)false_positive[i]; // false positive + acc_t tp = (acc_t)true_positive[i]; // true positive pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); rNumerator = rNumerator + tp; From 9a78d96b23ef786175b93b37f740c88f933a5b45 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 13:42:13 -0500 Subject: [PATCH 183/660] fix gpu --- .../include/deepgalois/layers/layer.h | 5 +- libdeepgalois/include/deepgalois/net.h | 37 +++--- libdeepgalois/include/deepgalois/utils.h | 4 +- libdeepgalois/src/net.cpp | 107 +++++++++++++----- libdeepgalois/src/net.cu | 6 +- libdeepgalois/src/utils.cpp | 11 +- lonestargnn/gcn/gcn.cpp | 31 +---- 7 files changed, 123 insertions(+), 78 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 116ab43aa1..e3c47bf72c 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -85,9 +85,10 @@ class layer : public deepgalois::node { begin_ = sample_begin; end_ = sample_end; count_ = sample_count; +#ifdef CPU_ONLY masks_ = masks; -#ifndef CPU_ONLY - copy_masks_device(input_dims[0], masks_, d_masks_); +#else + d_masks_ = masks; #endif } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 98573d60b5..af55864424 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -30,7 +30,11 @@ namespace deepgalois { // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: - Net() {} + Net() : 
is_single_class(true), num_samples(0), num_classes(0), + num_layers(0), num_epochs(0), + train_begin(0), train_end(0), train_count(0), + val_begin(0), val_end(0), val_count(0), + train_masks(NULL), val_masks(NULL), context(NULL) {} #ifndef GALOIS_USE_DIST void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop, bool is_single = true); @@ -44,8 +48,8 @@ class Net { void construct_layers(); void append_out_layer(size_t layer_id); void train(optimizer* opt, bool need_validate); // training - double evaluate(size_t begin, size_t end, size_t count, - mask_t* masks, acc_t& loss, acc_t& acc); // inference + double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference + void read_test_masks(std::string dataset, Graph* dGraph); //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, @@ -101,21 +105,28 @@ class Net { } protected: + bool is_single_class; // single-class (one-hot) or multi-class label + size_t num_samples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + unsigned num_epochs; // number of epochs + size_t train_begin, train_end, train_count; + size_t val_begin, val_end, val_count; + size_t test_begin, test_end, test_count; + + mask_t* train_masks; // masks for training + mask_t* d_train_masks; // masks for training on device + mask_t* val_masks; // masks for validation + mask_t* d_val_masks; // masks for validation on device + mask_t* test_masks; // masks for test + mask_t* d_test_masks; // masks for test on device + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else deepgalois::DistContext* context; #endif - bool is_single_class; // single-class (one-hot) or multi-class label - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 - unsigned num_epochs; // number of epochs - - std::vector feature_dims; // feature dimnesions for each layer - std::vector train_mask, val_mask; // masks for traning and validation - size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) #ifdef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index b7a84bb10a..71a0b7748c 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -110,9 +110,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, #ifdef GALOIS_USE_DIST size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, std::vector& masks, Graph* dGraph); + size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); #else size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, std::vector& masks); + size_t n, size_t& begin, size_t& end, mask_t* masks); #endif } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 08af8872f0..c0919b8c52 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -31,8 +31,10 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, num_epochs = epochs; //std::cout << "Reading label masks ... 
"; - train_mask.resize(num_samples, 0); - val_mask.resize(num_samples, 0); + train_masks = new mask_t[num_samples]; + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks+num_samples, 0); + std::fill(val_masks, val_masks+num_samples, 0); // get testing and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, @@ -40,37 +42,32 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; // TODO do all can be used below #ifndef GALOIS_USE_DIST - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; + for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; #else // find local ID from global ID, set if it exists for (size_t i = train_begin; i < train_end; i++) { if (dGraph->isLocal(i)) { - train_mask[dGraph->getLID(i)] = 1; + train_masks[dGraph->getLID(i)] = 1; } } for (size_t i = val_begin; i < val_end; i++) { if (dGraph->isLocal(i)) { - val_mask[dGraph->getLID(i)] = 1; + val_masks[dGraph->getLID(i)] = 1; } } #endif } else { #ifndef GALOIS_USE_DIST - train_count = - read_masks(dataset_str, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); + train_count = read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); + val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); #else - train_count = - read_masks(dataset_str, "train", train_begin, train_end, train_mask, - dGraph); - val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask, - dGraph); + train_count = read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); #endif } - //std::cout << "Done\n"; - // NOTE: train_begin/train_end are global IDs, train_mask is a local id + // NOTE: train_begin/train_end are global IDs, train_masks is a local id // train count and val count are LOCAL counts num_layers = NUM_CONV_LAYERS + 1; @@ -82,7 +79,10 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, feature_dims[2] = num_classes; // output embedding: E feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); + #ifndef CPU_ONLY + copy_masks_device(num_samples, train_masks, d_train_masks); + copy_masks_device(num_samples, val_masks, d_val_masks); context->copy_data_to_device(); // copy labels and input features to the device #endif } @@ -119,19 +119,20 @@ void Net::train(optimizer* opt, bool need_validate) { // forward: after this phase, layer edges will contain intermediate features // for use during backprop Tfw.start(); - train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + double fw_time = evaluate("train", train_loss, train_acc); + /* + train_loss = Net::fprop(train_begin, train_end, train_count, train_masks); // forward #ifdef CPU_ONLY Graph *g = context->getCpuGraphPointer(); #else CSRGraph *g = context->getGpuGraphPointer(); #endif if (is_single_class) { - train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0], g); // predict + train_acc = masked_accuracy(train_begin, train_end, train_count, train_masks, g); // predict } else { - train_acc = 
masked_multi_class_accuracy(train_begin, train_end, train_count, - &train_mask[0], g); // predict + train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, train_masks, g); // predict } + */ Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -157,15 +158,15 @@ void Net::train(optimizer* opt, bool need_validate) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], - val_loss, val_acc); + double val_time = evaluate("val", val_loss, val_acc); Tval.stop(); galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, " val_acc ", val_acc, seperator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); } else { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms\n"); + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); } } double avg_train_time = total_train_time / (double)num_epochs; @@ -175,11 +176,38 @@ void Net::train(optimizer* opt, bool need_validate) { } // evaluate, i.e. inference or predict -double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, - acc_t& loss, acc_t& acc) { +double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { // TODO may need to do something for the dist case Timer t_eval; t_eval.Start(); + size_t begin = 0, end = 0, count = 0; + mask_t* masks = NULL; + if (type == "train") { + begin = train_begin; + end = train_end; + count = train_count; + masks = train_masks; + } else if (type == "val") { + begin = val_begin; + end = val_end; + count = val_count; + masks = val_masks; + } else { + begin = test_begin; + end = test_end; + count = test_count; + masks = test_masks; + } +#ifndef CPU_ONLY + if (type == "train") { + masks = d_train_masks; + } else if (type == "val") { + masks = d_val_masks; + } else { + masks = d_test_masks; + } +#endif + loss = fprop(begin, end, count, masks); #ifdef CPU_ONLY Graph* g = context->getCpuGraphPointer(); @@ -233,6 +261,33 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } +void Net::read_test_masks(std::string dataset, Graph* dGraph) { + test_masks = new mask_t[num_samples]; + if (dataset == "reddit") { + test_begin = 177262; + test_count = 55703; + test_end = test_begin + test_count; +#ifndef GALOIS_USE_DIST + for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; +#else + for (size_t i = test_begin; i < test_end; i++) { + if (dGraph->isLocal(i)) { + test_masks[dGraph->getLID(i)] = 1; + } + } +#endif + } else { +#ifndef GALOIS_USE_DIST + test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); +#else + test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); +#endif + } +#ifndef CPU_ONLY + copy_masks_device(num_samples, test_masks, d_test_masks); +#endif +} + #ifdef CPU_ONLY /** * diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index e7f7d7b603..a26cf603b6 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -114,16 +114,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *g) { - return 
masked_accuracy_gpu(num_classes, begin, end, count, - layers[NUM_CONV_LAYERS]->get_device_masks(), + return masked_accuracy_gpu(num_classes, begin, end, count, masks, layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->get_labels_device_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { - return masked_f1_score_gpu(num_classes, begin, end, count, - layers[NUM_CONV_LAYERS]->get_device_masks(), + return masked_f1_score_gpu(num_classes, begin, end, count, masks, layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->get_labels_device_ptr()); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 77657c3f3c..46470e2997 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -62,7 +62,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, //! Get masks from datafile where first line tells range of //! set to create mask from size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, std::vector& masks) { + size_t n, size_t& begin, size_t& end, mask_t* masks) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -96,14 +96,14 @@ size_t read_masks(std::string dataset_str, std::string mask_type, } std::cout << mask_type + "_mask range: [" << begin << ", " << end << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)masks.size()*(float)100 << "\%)\n"; + << (float)sample_count/(float)n*(float)100 << "\%)\n"; in.close(); return sample_count; } #else size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, - std::vector& masks, Graph* dGraph) { + size_t n, size_t& begin, size_t& end, + mask_t* masks, Graph* dGraph) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -139,7 +139,8 @@ size_t read_masks(std::string dataset_str, std::string mask_type, i++; } std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "\n"; + << ") Number of valid samples: " << sample_count << "(" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; in.close(); return sample_count; } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index e23097befe..109b0522f1 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -46,38 +46,17 @@ int main(int argc, char** argv) { Ttrain.stop(); if (do_test) { - galois::gPrint("\n"); // test using test samples - size_t n = network.get_nnodes(); - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 0, test_end = n, test_count = n; - std::vector test_mask(n, 0); - if (dataset == "reddit") { - test_begin = 177262; - test_count = 55703; - test_end = test_begin + test_count; -#ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) - test_mask[i] = 1; -#else - for (size_t i = test_begin; i < test_end; i++) { - if (dGraph->isLocal(i)) { - test_mask[dGraph->getLID(i)] = 1; - } - } -#endif - } else { + galois::gPrint("\n"); #ifndef GALOIS_USE_DIST - test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, test_mask); + network.read_test_masks(dataset, NULL); #else - test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, - test_mask, dGraph); + network.read_test_masks(dataset, dGraph); #endif - } galois::StatTimer Ttest("Test"); Ttest.start(); - double test_time = 
network.evaluate(test_begin, test_end, test_count, - &test_mask[0], test_loss, test_acc); + acc_t test_loss = 0.0, test_acc = 0.0; + double test_time = network.evaluate("test", test_loss, test_acc); galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, " test_time = ", test_time, "\n"); Ttest.stop(); From bcb03c51dcb1550f50cdba46bc19d2e740cc056b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 18:46:06 -0500 Subject: [PATCH 184/660] fix f1score gpu --- libdeepgalois/include/deepgalois/net.h | 17 +------ libdeepgalois/src/context.cu | 16 +++++-- .../src/layers/sigmoid_loss_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 46 ++++++++++++++++++- libdeepgalois/src/net.cpp | 16 +++++++ libdeepgalois/src/net.cu | 22 ++++++--- 6 files changed, 89 insertions(+), 30 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index af55864424..cb7578dafc 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -50,6 +50,7 @@ class Net { void train(optimizer* opt, bool need_validate); // training double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference void read_test_masks(std::string dataset, Graph* dGraph); + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, @@ -72,22 +73,6 @@ class Net { layers[i]->print_layer_info(); } - //! forward propagation: [begin, end) is the range of samples used. - //! calls "forward" on the layers of the network and returns the loss of the - //! final layer - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) { - layers[i]->forward(); - // TODO need to sync model between layers here - } - return layers[num_layers - 1]->get_masked_loss(); - } - // back propogation void bprop() { for (size_t i = num_layers; i != 0; i--) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 4ed442c70d..93300abffb 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -83,6 +83,9 @@ Context::~Context() { CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (d_labels) CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) CUDA_CHECK(cudaFree(d_feats)); + if (norm_factor) CUDA_CHECK(cudaFree(norm_factor)); } size_t Context::read_graph(std::string dataset_str, bool selfloop) { @@ -136,12 +139,15 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { } void Context::copy_data_to_device() { - CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), - cudaMemcpyHostToDevice)); + if (is_single_class) { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + } else { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * 
feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); //print_device_vector(10, d_feats, "d_feats"); } diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 4a76861860..feb493a636 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -41,7 +41,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* size_t idx = len * i; float_t *norm_grad = new float_t[len]; float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); + for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index fa5f02de21..6438fc5db3 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -529,6 +529,8 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, if (pid < len) p[warp_lane][pid] = data[base+pid]; } __syncthreads(); + + // cross entropy derivative for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { @@ -538,6 +540,8 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, } } __syncthreads(); + + // softmax derivative for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { @@ -579,7 +583,47 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) p[warp_lane][pid] = data[base+pid]; + } + __syncthreads(); + // cross entropy derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + //if (p[warp_lane][pid] == 0) + d[warp_lane][pid] = -(float_t)labels[base+pid] / (p[warp_lane][pid] + 1e-10); + //else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; + } + } + __syncthreads(); + + // sigmoid derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t self = p[warp_lane][pid]; + float_t dp = d[warp_lane][pid]; + grad[base+pid] = dp * self * (float_t(1) - self); + } + } + __syncthreads(); + } + } } void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, @@ -587,7 
+631,7 @@ void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* out, float_t* diff) { d_sigmoid_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( len, begin, end, masks, labels, out, diff); - CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); + CudaTest("solving d_sigmoid_cross_entropy_warp kernel failed"); } __global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c0919b8c52..45b91142d8 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -223,6 +223,22 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { return t_eval.Millisecs(); } +//! forward propagation: [begin, end) is the range of samples used. +//! calls "forward" on the layers of the network and returns the loss of the +//! final layer +acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) { + layers[i]->forward(); + // TODO need to sync model between layers here + } + return layers[num_layers - 1]->get_masked_loss(); +} + void Net::construct_layers() { std::cout << "\nConstructing layers...\n"; append_conv_layer(0, true); // first conv layer diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index a26cf603b6..e8d60b1e03 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -25,8 +25,7 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, local_accuracy; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, - preds + (begin + i) * num_classes); + label_t pred = (label_t)argmax_device(num_classes, preds + (begin + i) * num_classes); if (pred == labels[begin + i]) total.reduce(1.0); } @@ -56,9 +55,10 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, f1count_t* false_positive, f1count_t* false_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) { + int id = begin + i; + if (masks[id] == 1) { for (size_t j = 0; j < num_classes; j++) { - auto idx = i * num_classes + j; + int idx = id * num_classes + j; if (labels[idx] == 1 && preds[idx] > 0.5) { atomicAdd(&true_positive[j], 1.0); } else if (labels[idx] == 0 && preds[idx] > 0.5) { @@ -82,11 +82,15 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); + init_const_gpu(num_classes, 0.0, d_tp); + init_const_gpu(num_classes, 0.0, d_fp); + init_const_gpu(num_classes, 0.0, d_fn); masked_f1_score_kernel<<>>( num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); - cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); - cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); - cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); + CudaTest("solving masked_f1_score_kernel kernel failed"); + CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), 
cudaMemcpyDeviceToHost)); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; @@ -105,9 +109,13 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, acc_t precisionMicro = pNumerator / pDenominator; acc_t fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / ((beta * beta) * precisionMicro + recallMicro); + float_free_device(d_tp); float_free_device(d_fp); float_free_device(d_fn); + delete h_tp; + delete h_fp; + delete h_fn; return fscoreMicro; } From a2d83a606bf2a8f60a5018bd16daa0be0b1eea4c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 22 Apr 2020 20:25:18 -0500 Subject: [PATCH 185/660] add weight_decay --- .../deepgalois/layers/graph_conv_layer.h | 1 + .../include/deepgalois/layers/layer.h | 3 +- .../deepgalois/layers/sigmoid_loss_layer.h | 2 +- .../deepgalois/layers/softmax_loss_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 7 ++- libdeepgalois/include/deepgalois/net.h | 29 ++++----- libdeepgalois/include/deepgalois/utils.h | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 11 +++- libdeepgalois/src/layers/graph_conv_layer.cu | 11 +++- .../src/layers/sigmoid_loss_layer.cpp | 2 +- .../src/layers/sigmoid_loss_layer.cu | 2 +- .../src/layers/softmax_loss_layer.cpp | 2 +- .../src/layers/softmax_loss_layer.cu | 2 +- libdeepgalois/src/math_functions.cpp | 25 ++++++-- libdeepgalois/src/math_functions.cu | 5 ++ libdeepgalois/src/net.cpp | 56 +++++++++-------- libdeepgalois/src/net.cu | 4 +- lonestargnn/gcn/gcn.cpp | 19 +++--- lonestargnn/include/lonestargnn.h | 60 +++++++------------ 19 files changed, 139 insertions(+), 106 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index c9b8729d62..63062133df 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -37,6 +37,7 @@ class graph_conv_layer : public layer { std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor(); } + virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index e3c47bf72c..188feebe75 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -61,7 +61,8 @@ class layer : public deepgalois::node { //! save context virtual void set_context(ContextType* ctx) { context = ctx; } //! 
return layer loss - virtual acc_t get_masked_loss() { return acc_t(0); } + virtual acc_t get_prediction_loss() { return acc_t(0); } + virtual acc_t get_weight_decay_loss() { return acc_t(0); } // main functions for layer work virtual void forward_propagation(const float_t* in_data, diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 334bf4363e..0f46cde043 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -13,6 +13,6 @@ class sigmoid_loss_layer : public layer { virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); - virtual acc_t get_masked_loss(); + virtual acc_t get_prediction_loss(); }; } diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 7194d06f2e..1a5b7e86ee 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -13,6 +13,6 @@ class softmax_loss_layer : public layer { virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); - virtual acc_t get_masked_loss(); + virtual acc_t get_prediction_loss(); }; } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 2c3a8014ee..a70ae071f9 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,6 +27,8 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); +//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 +float_t l2_norm(size_t n, const float_t* a); //! 
clear n elements of a vector void clear_cpu(size_t n, float_t* in); // dropout functions randomly remove weights @@ -153,8 +155,8 @@ void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in_da void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); -void scal_gpu(const int N, const float alpha, float* X); -void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +void scal_gpu(const int n, const float alpha, float* X); +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y); void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); @@ -162,4 +164,5 @@ void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t l2_norm_gpu(int n, float_t *tensor); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index cb7578dafc..6a03611371 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -18,10 +18,6 @@ #include "deepgalois/DistContext.h" #endif - - -#define NUM_CONV_LAYERS 2 - namespace deepgalois { // N: number of vertices, D: feature vector dimentions, @@ -31,17 +27,15 @@ namespace deepgalois { class Net { public: Net() : is_single_class(true), num_samples(0), num_classes(0), - num_layers(0), num_epochs(0), + num_conv_layers(0), num_layers(0), num_epochs(0), + learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), val_count(0), - train_masks(NULL), val_masks(NULL), context(NULL) {} - #ifndef GALOIS_USE_DIST - void init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, bool is_single = true); - #else - void init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, Graph* dGraph); - #endif + test_begin(0), test_end(0), test_count(0), + train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} + void init(std::string dataset_str, unsigned num_conv, unsigned epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool is_single, Graph* dGraph); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -54,8 +48,7 @@ class Net { //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true, - float_t dropout_rate = 0.5); + bool bias = false, bool dropout = true); //! 
Save the context object to all layers of the network void set_contexts() { @@ -93,8 +86,12 @@ class Net { bool is_single_class; // single-class (one-hot) or multi-class label size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) unsigned num_epochs; // number of epochs + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting size_t train_begin, train_end, train_count; size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 71a0b7748c..097457290d 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -102,7 +102,7 @@ uniform_rand(T min, T max) { } inline bool bernoulli(float_t p) { - return uniform_rand(float_t(0), float_t(1)) <= p; + return uniform_rand(float_t(0), float_t(1)) > p; } acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 171b32305c..b640acd75a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -123,6 +123,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); #endif } -#endif + +acc_t graph_conv_layer::get_weight_decay_loss() { + acc_t loss = 0.0; + for (size_t i = 0; i < y*z; i+=z) { + loss += math::l2_norm(z, &layer::W[i]); + } + return loss; +} + +#endif // end if CPU_ONLY } // namespace diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 12d9902179..322500d916 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -47,7 +47,11 @@ void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, // GPU forward: compute output features // NOTE: in_data will be used in back-prop, so it can not be modified void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - assert(z <= MAX_NUM_CLASSES); // currently only support feature length <= 128 + if (z > MAX_NUM_CLASSES) { + std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; + // currently only support feature length <= 128 + exit(0); + } init_const_gpu(x*z, 0.0, out_temp); if (dropout_ && phase_ == deepgalois::net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); @@ -83,5 +87,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } +acc_t graph_conv_layer::get_weight_decay_loss() { + acc_t loss = l2_norm_gpu(y*z, d_W); + return loss; +} + } // namespace diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index feb493a636..763bd6646d 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -52,7 +52,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* }, galois::chunk_size(), galois::steal(), 
galois::loopname("sigmoid-loss-bw")); } -acc_t sigmoid_loss_layer::get_masked_loss() { +acc_t sigmoid_loss_layer::get_prediction_loss() { assert(count_ > 0); AccumF total_loss; AccumU valid_sample_count; diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 6d7268d4af..c52b9089f0 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -31,7 +31,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, context->get_labels_device_ptr(), out_data, in_grad); } -acc_t sigmoid_loss_layer::get_masked_loss() { +acc_t sigmoid_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 7c5b11d233..4a92e56ec3 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -58,7 +58,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, // no weight sync required: this is all local graph information } -acc_t softmax_loss_layer::get_masked_loss() { +acc_t softmax_loss_layer::get_prediction_loss() { assert(count_ > 0); AccumF total_loss; AccumU valid_sample_count; diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index c2f3a98303..e73ef27f33 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -31,7 +31,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, context->get_labels_device_ptr(), out_data, in_grad); } -acc_t softmax_loss_layer::get_masked_loss() { +acc_t softmax_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index cdde9cc964..f81444fa70 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -45,7 +45,7 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #endif } -const size_t vec_len = 8; +const size_t vec_len = 8; // for 32-bit floating point in AVX2 // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { @@ -76,6 +76,17 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; } + +float_t l2_norm(size_t n, const float_t* in) { + const size_t alignedN = n - n % vec_len; + __m256 vsum = _mm256_set1_ps(0.0); + for (size_t i = 0; i < alignedN; i += vec_len) { + __m256 a = _mm256_loadu_ps(&in[i]); + vsum = _mm256_add_ps(vsum, _mm256_mul_ps(a, a)); + } + __m256 sum = _mm256_hadd_ps(vsum, vsum); + return ((float_t*)&sum)[0] + ((float_t*)&sum)[2];; +} #else // vector multiply scalar void mul_scalar(const float_t alpha, vec_t& Y) { @@ -85,6 +96,12 @@ void mul_scalar(const float_t alpha, vec_t& Y) { void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } + +float_t l2_norm(size_t n, const float_t* a) { + float_t sum = 0.0; + for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; + return sum/2.0; +} #endif // dot product @@ -117,7 +134,7 @@ void dropout(const float scale, const float 
dropout_rate, const vec_t& in, assert(masks.size() == out.size()); // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -125,7 +142,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& masks, float_t* out) { for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -133,7 +150,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout_cpu(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; out[i] = in[i] * masks[i] * scale; }, galois::loopname("dropout")); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 6438fc5db3..c15b749e8d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -659,3 +659,8 @@ acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* return *(total_loss.cpu_rd_ptr()) / count; } +acc_t l2_norm_gpu(int n, float_t * tensor) { + acc_t sum = 0.0; + return sum / 2.0; +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 45b91142d8..91e39affeb 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -6,15 +6,22 @@ namespace deepgalois { -#ifndef GALOIS_USE_DIST -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, bool is_single) { -#else -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, Graph* dGraph) { -#endif -#ifndef GALOIS_USE_DIST +void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool is_single, Graph* dGraph) { + num_conv_layers = num_conv; + num_epochs = epochs; + learning_rate = lr; + dropout_rate = dropout; + weight_decay = wd; is_single_class = is_single; + galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, + ", num_epochs ", num_epochs, + ", hidden1 ", hidden1, + ", learning_rate ", learning_rate, + ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); +#ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_label_class(is_single); num_samples = context->read_graph(dataset_str, selfloop); @@ -28,14 +35,14 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, // read graph, get num nodes num_classes = context->read_labels(dataset_str); - num_epochs = epochs; //std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; val_masks = new mask_t[num_samples]; std::fill(train_masks, train_masks+num_samples, 0); std::fill(val_masks, val_masks+num_samples, 0); - // get testing and validation sets + + // get training and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; @@ -70,7 +77,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, // NOTE: train_begin/train_end are global IDs, train_masks is a local id // train count and val count are LOCAL counts - num_layers = NUM_CONV_LAYERS + 1; + num_layers = num_conv_layers + 1; // initialize feature metadata feature_dims.resize(num_layers + 1); feature_dims[0] = @@ -88,15 +95,12 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, } void Net::train(optimizer* opt, bool need_validate) { -#ifdef GALOIS_USE_DIST - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - std::string header = "[" + std::to_string(myID) + "] "; - std::string seperator = "\n"; -#else - //std::string header = "[" + std::to_string(0) + "] "; - //std::string seperator = "\n"; std::string header = ""; std::string seperator = " "; +#ifdef GALOIS_USE_DIST + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + header = "[" + std::to_string(myID) + "] "; + seperator = "\n"; #endif galois::gPrint("\nStart training...\n"); @@ -236,7 +240,11 @@ acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { layers[i]->forward(); // TODO need to sync model between layers here } - return layers[num_layers - 1]->get_masked_loss(); + // prediction error + auto loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + return loss; } void Net::construct_layers() { @@ -265,9 +273,9 @@ void Net::append_out_layer(size_t layer_id) { //! 
Add a convolution layer to the network void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, - bool dropout, float_t dropout_rate) { + bool dropout) { assert(dropout_rate < 1.0); - assert(layer_id < NUM_CONV_LAYERS); + assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); @@ -327,7 +335,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks if (masks[i] == 1) { // get prediction int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + &(layers[num_conv_layers - 1]->next()->get_data()[i * num_classes])); // check prediction if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; @@ -342,7 +350,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks if (masks[localID] == 1) { // get prediction int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[localID * num_classes])); + &(layers[num_conv_layers - 1]->next()->get_data()[localID * num_classes])); // check prediction if ((label_t)preds == context->get_label(localID)) accuracy_all += 1.0; @@ -361,7 +369,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { - auto preds = layers[NUM_CONV_LAYERS - 1]->next()->get_data(); + auto preds = layers[num_conv_layers - 1]->next()->get_data(); auto ground_truth = context->get_labels_ptr(); return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index e8d60b1e03..c7acda5666 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -123,14 +123,14 @@ namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *g) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, - layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + layers[num_conv_layers - 1]->next()->get_data(), context->get_labels_device_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, - layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + layers[num_conv_layers - 1]->next()->get_data(), context->get_labels_device_ptr()); } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 109b0522f1..1a3698bc96 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,25 +18,22 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train + Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif -#ifndef GALOIS_USE_DIST // read network, features, ground truth, initialize metadata - network.init(dataset, epochs, hidden1, add_selfloop, is_single_class); -#else - network.init(dataset, epochs, hidden1, add_selfloop, dGraph); -#endif - network.construct_layers(); // default setting for now; can be customized by - // the user + network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, + dropout_rate, weight_decay, add_selfloop, is_single_class, dGraph); + // default setting for now; can be customized by the user + network.construct_layers(); 
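  // Descriptive note (added for clarity; mirrors Net::construct_layers and the
  // feature_dims setup in net.cpp): construct_layers() appends num_conv_layers
  // graph convolution layers followed by the output loss layer, e.g. with the
  // default 2-layer GCN:
  //   layer 0: input features N x D      -> hidden    N x hidden1
  //   layer 1: hidden         N x hidden1 -> scores    N x E
  //   layer 2: softmax (single-class) or sigmoid (multi-class) loss over N x E
  // print_layers_info() then reports each layer's input/output dimensions.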
network.print_layers_info(); + deepgalois::ResourceManager rm; // tracks peak memory usage - // tracks peak memory usage - deepgalois::ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details + // the optimizer used to update parameters, + // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); deepgalois::optimizer* opt = new deepgalois::adam(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 72acb8d1ff..1c96548a36 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -16,45 +16,31 @@ #endif namespace cll = llvm::cl; -static cll::opt - dataset(cll::Positional, cll::desc(""), - cll::Required); // 'cora', 'citeseer', 'pubmed' -//static cll::opt -// filetype(cll::Positional, cll::desc(""), -// cll::init("gr")); // file format of the input graph -static cll::opt - model("m", cll::desc("Model string"), - cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt - learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), - cll::init(0.01)); -static cll::opt - epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), - cll::init(1)); -static cll::opt - hidden1("h", - cll::desc("Number of units in hidden layer 1 (default value 16)"), - cll::init(16)); -static cll::opt dropout_rate( - "d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), - cll::init(0.5)); -static cll::opt weight_decay( - "wd", - cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), - cll::init(5e-4)); -static cll::opt early_stopping( - "es", - cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), - cll::init(10)); -static cll::opt max_degree( - "md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), - cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), - cll::init(1)); -static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt dataset(cll::Positional, + cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' +//static cll::opt model("m", +// cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt epochs("k", + cll::desc("number of epoch, i.e. 
iterations (default value 1)"), cll::init(1)); +static cll::opt num_conv_layers("nc", + cll::desc("number of convolutional layers, (default value 2)"), cll::init(2)); +static cll::opt hidden1("h", + cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); +static cll::opt learning_rate("lr", + cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); +static cll::opt dropout_rate("d", + cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); +static cll::opt weight_decay("wd", + cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); +static cll::opt early_stopping("es", + cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); +static cll::opt max_degree("md", + cll::desc("Maximum size of the downsampled adjacency lists (default value 25)"), cll::init(25)); static cll::opt is_single_class("sc", cll::desc("single-class or multi-class label (default single)"), cll::init(1)); +static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 6b59d1c7a9a29b84188d8696467f37d6501ac3c8 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 23 Apr 2020 10:09:18 -0500 Subject: [PATCH 186/660] add leaky_relu_layer --- libdeepgalois/CMakeLists.txt | 17 ++++-- .../deepgalois/layers/leaky_relu_layer.h | 20 +++++++ .../include/deepgalois/layers/relu_layer.h | 11 +--- .../include/deepgalois/math_functions.hh | 4 ++ libdeepgalois/include/deepgalois/types.h | 2 +- libdeepgalois/src/layers/leaky_relu_layer.cpp | 32 ++++++++++++ libdeepgalois/src/layers/leaky_relu_layer.cu | 17 ++++++ libdeepgalois/src/layers/relu_layer.cpp | 52 +++++-------------- libdeepgalois/src/layers/relu_layer.cu | 19 +++++++ libdeepgalois/src/math_functions.cu | 32 ++++++++++-- 10 files changed, 149 insertions(+), 57 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h create mode 100644 libdeepgalois/src/layers/leaky_relu_layer.cpp create mode 100644 libdeepgalois/src/layers/leaky_relu_layer.cu create mode 100644 libdeepgalois/src/layers/relu_layer.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index c2c64d4f0c..193988f414 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -41,6 +41,8 @@ else() src/layers/graph_conv_layer.cu src/layers/softmax_loss_layer.cu src/layers/sigmoid_loss_layer.cu + src/layers/leaky_relu_layer.cu + src/layers/relu_layer.cu src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu @@ -58,25 +60,30 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way set(sources - src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/relu_layer.cpp src/layers/aggregator.cpp - src/layers/layer.cpp src/math_functions.cpp - src/optimizer.cpp + src/layers/layer.cpp src/DistContext.cpp + src/optimizer.cpp src/utils.cpp src/node.cpp src/net.cpp ) else() set(sources - src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + 
src/layers/leaky_relu_layer.cpp + src/layers/relu_layer.cpp src/layers/aggregator.cpp - src/layers/layer.cpp src/math_functions.cpp + src/layers/layer.cpp src/optimizer.cpp src/context.cpp src/sampler.cpp diff --git a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h new file mode 100644 index 0000000000..a8b6136eea --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// Leaky ReLU Layer +class leaky_relu_layer : public layer { +public: + leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, dims_t out_dims); + leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) : + leaky_relu_layer(level, 0.0, in_dims, out_dims) {} + ~leaky_relu_layer() {} + std::string layer_type() const override { return std::string("leaky_relu"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); +protected: + float_t epsilon_; + size_t n; +}; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h index a85d51608d..601c5d67ed 100644 --- a/libdeepgalois/include/deepgalois/layers/relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -5,18 +5,11 @@ namespace deepgalois { // ReLU Layer class relu_layer : public layer { public: - relu_layer(unsigned level, std::vector in_dims, - std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } + relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims) { trainable_ = false; } ~relu_layer() {} std::string layer_type() const override { return std::string("relu"); } - virtual void forward_propagation(const tensor_t& in_data, tensor_t& out_data); virtual void forward_propagation(const float_t* in_data, float_t* out_data); - virtual void back_propagation(const tensor_t& in_data, - const tensor_t& out_data, tensor_t& out_grad, - tensor_t& in_grad); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); }; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index a70ae071f9..a66d721d34 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -127,6 +127,10 @@ void vadd_gpu(const int n, const float_t* a, const float_t* b, void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative +void leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in, float_t* out); // Leaky ReLU +void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, + const float_t* data, float_t* out_diff); // Leaky ReLU derivative void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out); // dropout void d_dropout_gpu(const int n, const float scale, const float dropout_rate, diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 92e0d31772..a2f6164439 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ 
-24,6 +24,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: typedef uint32_t VertexID; typedef uint64_t EdgeID; typedef std::vector VertexList; +typedef std::vector dims_t; // dimentions type #define CHUNK_SIZE 256 #define TB_SIZE 256 @@ -33,7 +34,6 @@ typedef std::vector VertexList; #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE - #ifdef GALOIS_USE_DIST namespace deepgalois { //! Set this to let sync struct know where to get data from diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp new file mode 100644 index 0000000000..650a0aa1be --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -0,0 +1,32 @@ +#include "deepgalois/layers/leaky_relu_layer.h" + +namespace deepgalois { + +leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, + dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + n = input_dims[0] * input_dims[1]; + name_ = layer_type() + "_" + std::to_string(level); +} + +#ifdef CPU_ONLY +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out_data[i] = in_data[i] > (float_t)0 ? in_data[i] : epsilon_ * in_data[i]; + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-fw")); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) +void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + in_grad[i] = out_grad[i] * (out_data[i] > float_t(0) ? float_t(1) : epsilon_); + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-bw")); +} +#endif + +} // namespace diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu new file mode 100644 index 0000000000..43e7f93d04 --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -0,0 +1,17 @@ +#include "deepgalois/layers/leaky_relu_layer.h" + +namespace deepgalois { + +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? 
๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + leaky_relu_gpu(n, epsilon_, in_data, out_data); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) +void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_leaky_relu_gpu(n, epsilon_, out_grad, in_data, in_grad); +} + +} // namespace diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 7441294f83..f0d3a74a49 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -2,48 +2,24 @@ namespace deepgalois { +#ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const tensor_t& in_data, - tensor_t& out_data) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), - [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0); - }, - galois::chunk_size(), galois::steal(), - galois::loopname("relu_layer-fw")); -} - -// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - const size_t count = input_dims[0] * input_dims[1]; - relu_gpu(count, in_data, out_data); -} - -// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) -// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const tensor_t& in_data, - const tensor_t& out_data, tensor_t& out_grad, - tensor_t& in_grad) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), - [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - in_grad[i][j] = out_data[i][j] > float_t(0) - ? out_grad[i][j] - : float_t(0); - }, - galois::chunk_size(), galois::steal(), - galois::loopname("relu_layer-bw")); +void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t n = input_dims[0] * input_dims[1]; + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out_data[i] = std::max(in_data[i], (float_t)0); + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-fw")); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) -// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const float_t* in_data, - const float_t* out_data, float_t* out_grad, - float_t* in_grad) { - const size_t count = input_dims[0] * input_dims[1]; - d_relu_gpu(count, out_grad, in_data, in_grad); +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t n = input_dims[0] * input_dims[1]; + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + in_grad[i] = out_data[i] > float_t(0) ? 
out_grad[i] : float_t(0); + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-bw")); } +#endif } // namespace diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu new file mode 100644 index 0000000000..f3a45936b4 --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -0,0 +1,19 @@ +#include "deepgalois/layers/relu_layer.h" + +namespace deepgalois { + +// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) +void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); +} + +} // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index c15b749e8d..61114f0daf 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -109,7 +109,7 @@ void d_dropout_gpu(const int n, const float scale, const float dropout_rate, // flattern data into 1D before feed into the ReLU operater __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { - CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : 0; } } void relu_gpu(const int n, const float_t* in, float_t* out) { @@ -119,9 +119,7 @@ void relu_gpu(const int n, const float_t* in, float_t* out) { __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = data[index] > 0 ? in_diff[index] : 0; - } + CUDA_KERNEL_LOOP(i, n) { out_diff[i] = data[i] > 0 ? in_diff[i] : 0; } } void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, @@ -131,6 +129,32 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, CudaTest("solving d_relu kernel failed"); } +// flattern data into 1D before feed into the ReLU operater +__global__ void leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; } +} + +void leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in, float_t* out) { + leaky_relu_kernel<<>>(n, epsilon, in, out); + CudaTest("solving leaky_relu kernel failed"); +} + +__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, float_t* out_diff) { + CUDA_KERNEL_LOOP(i, n) { + out_diff[i] = in_diff[i] * (data[i] > 0 ? 
1.0 : epsilon); + } +} + +void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, + const float_t* data, float_t* out_diff) { + d_leaky_relu_kernel<<>>( + n, epsilon, in_diff, data, out_diff); + CudaTest("solving d_leaky_relu kernel failed"); +} + __global__ void matmul_kernel(int x, int y, int z, const float_t* A, const float_t* B, float_t* C) { int row = blockIdx.x*blockDim.x+threadIdx.x; From 693faed70c3b3d93f9d571d159faf6bd5773d234 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 23 Apr 2020 12:47:30 -0500 Subject: [PATCH 187/660] update leaky_relu --- .../include/deepgalois/math_functions.hh | 2 ++ libdeepgalois/src/layers/leaky_relu_layer.cpp | 8 ++------ libdeepgalois/src/layers/relu_layer.cpp | 8 ++------ libdeepgalois/src/math_functions.cpp | 19 +++++++++++++++++-- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index a66d721d34..dd4d5e4219 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -41,6 +41,8 @@ void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, void relu_cpu(size_t n, const float_t* in, float_t* out); //! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); // Loss function for single-class label (one-hot) data: softmax void softmax(const vec_t& input, vec_t& output); diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index 650a0aa1be..0d5a7f66fb 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -14,18 +14,14 @@ leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, #ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out_data[i] = in_data[i] > (float_t)0 ? in_data[i] : epsilon_ * in_data[i]; - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-fw")); + math::leaky_relu_cpu(n, epsilon_, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - in_grad[i] = out_grad[i] * (out_data[i] > float_t(0) ? 
float_t(1) : epsilon_); - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-bw")); + math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index f0d3a74a49..2e89af1bd5 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -6,9 +6,7 @@ namespace deepgalois { // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t n = input_dims[0] * input_dims[1]; - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out_data[i] = std::max(in_data[i], (float_t)0); - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-fw")); + math::relu_cpu(n, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) @@ -16,9 +14,7 @@ void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { size_t n = input_dims[0] * input_dims[1]; - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - in_grad[i] = out_data[i] > float_t(0) ? out_grad[i] : float_t(0); - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-bw")); + math::d_relu_cpu(n, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index f81444fa70..c6b64504e8 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -178,7 +178,7 @@ void relu_cpu(size_t n, const float_t* in, float_t* out) { // TODO: vectorize galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); - }, galois::loopname("relu")); + }, galois::chunk_size<64>(), galois::loopname("relu")); } void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) { @@ -186,7 +186,22 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) // check if original data greater than 0; if so keep grad galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { out[i] = data[i] > float_t(0) ? in[i] : float_t(0); - }, galois::loopname("d_relu")); + }, galois::chunk_size<64>(), galois::loopname("d_relu")); +} + +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) { + // TODO: vectorize + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; + }, galois::chunk_size<64>(), galois::loopname("leaky_relu")); +} + +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out) { + // TODO: vectorize + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = in[i] * (data[i] > float_t(0) ? 
float_t(1) : epsilon); + }, galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); } void softmax(const vec_t& input, vec_t& output) { From b5796f33a8b42bf84153081c2643acab37f297f2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 24 Apr 2020 14:32:43 -0500 Subject: [PATCH 188/660] support arbitrary num_conv_layers --- .../include/deepgalois/math_functions.hh | 5 +-- libdeepgalois/include/deepgalois/net.h | 18 +++-------- libdeepgalois/src/math_functions.cpp | 15 +++++++++ libdeepgalois/src/math_functions.cu | 30 +++++++++++------ libdeepgalois/src/net.cpp | 32 ++++++++++++++----- 5 files changed, 66 insertions(+), 34 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index dd4d5e4219..edd7fc6eb6 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,6 +27,7 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); +float_t axpy(size_t n, const float_t a, float_t *x, float_t *y); //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector @@ -124,8 +125,8 @@ int argmax(const size_t n, const float_t* x); // the arguments of the maxima bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element void init_const_gpu(int n, float_t value, float_t *array); void copy_gpu(int len, const float_t* in, float_t* out); -void vadd_gpu(const int n, const float_t* a, const float_t* b, - float_t* out); // vector add +void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* out); // vector add +void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y); // axpy void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 6a03611371..e4016231d4 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -66,21 +66,11 @@ class Net { layers[i]->print_layer_info(); } - // back propogation - void bprop() { - for (size_t i = num_layers; i != 0; i--) { - layers[i - 1]->backward(); - } - } - + void bprop(); // back propogation + void normalize(); + void regularize(); // update trainable weights after back-propagation - void update_weights(optimizer* opt) { - for (size_t i = 0; i < num_layers; i++) { - if (layers[i]->trainable()) { - layers[i]->update_weight(opt); - } - } - } + void update_weights(optimizer* opt); protected: bool is_single_class; // single-class (one-hot) or multi-class label diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index c6b64504e8..9cff465a73 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -77,6 +77,17 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; } +// SAXPY stands for โ€œSingle-precision A*X Plus Y" +float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { + const size_t alignedN = n - n % vec_len; + const __m256 alpha = _mm256_set1_ps(a); + for (size_t 
i = 0; i < alignedN; i += vec_len) { + __m256 product = _mm256_mul_ps(_mm256_loadu_ps(&x[i]), alpha); + _mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&y[i]), product)); + } + for (size_t i = alignedN; i < n; ++i) y[i] = a * x[i] + y[i]; +} + float_t l2_norm(size_t n, const float_t* in) { const size_t alignedN = n - n % vec_len; __m256 vsum = _mm256_set1_ps(0.0); @@ -97,6 +108,10 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } +float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { + for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; +} + float_t l2_norm(size_t n, const float_t* a) { float_t sum = 0.0; for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 61114f0daf..b906702d9c 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -281,12 +281,12 @@ __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = alpha; } } -void set_gpu(const int N, const float_t alpha, float_t* Y) { +void set_gpu(const int n, const float_t alpha, float_t* Y) { if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); + CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * n)); return; } - set_kernel<<>>(N, alpha, Y); + set_kernel<<>>(n, alpha, Y); CudaTest("solving set kernel failed"); } @@ -295,8 +295,8 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, CUDA_KERNEL_LOOP(index, n) { y[index] += alpha; } } -void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { - add_scalar_kernel<<>>(N, alpha, Y); +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { + add_scalar_kernel<<>>(n, alpha, Y); CudaTest("solving add_scalar kernel failed"); } @@ -305,13 +305,23 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } -void copy_gpu(int len, const float_t* in, float_t* out) { - CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* y) { + vadd_kernel<<>>(n, a, b, y); + CudaTest("solving vadd kernel failed"); } -void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { - vadd_kernel<<>>(N, a, b, y); - CudaTest("solving vadd kernel failed"); +__global__ void axpy_kernel(const int n, const float_t a, const float_t* x, + float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = a * x[i] + y[i]; } +} + +void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { + axpy_kernel<<>>(n, a, x, y); + CudaTest("solving axpy kernel failed"); +} + +void copy_gpu(int len, const float_t* in, float_t* out) { + CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } // TODO: use warp diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 91e39affeb..58fe59312e 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -80,11 +80,11 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, num_layers = num_conv_layers + 1; // initialize feature metadata feature_dims.resize(num_layers + 1); - feature_dims[0] = - context->read_features(dataset_str); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - 
feature_dims[3] = num_classes; // normalized output embedding: E + feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY @@ -247,11 +247,27 @@ acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { return loss; } +void Net::bprop() { + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } +} + +void Net::update_weights(optimizer* opt) { + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } +} + void Net::construct_layers() { + // append conv layers std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer + for (size_t i = 0; i < num_conv_layers-1; i++) + append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + append_out_layer(num_layers-1); // output layer layers[0]->set_in_data(context->get_in_ptr()); // feed input data context->norm_factor_counting(); set_contexts(); From d38dba1823f49d80c0ae14d80e9da358b6759147 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 24 Apr 2020 16:31:11 -0500 Subject: [PATCH 189/660] fix weight decay --- .../include/deepgalois/layers/layer.h | 8 +++- .../include/deepgalois/math_functions.hh | 2 +- libdeepgalois/include/deepgalois/net.h | 7 ++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 6 +-- libdeepgalois/src/layers/graph_conv_layer.cu | 3 +- libdeepgalois/src/math_functions.cpp | 28 +++++++++---- libdeepgalois/src/math_functions.cu | 41 ++++++++++++------- libdeepgalois/src/net.cpp | 20 +++++++++ 8 files changed, 78 insertions(+), 37 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 188feebe75..17ab4e6694 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -79,6 +79,11 @@ class layer : public deepgalois::node { std::string get_name() { return name_; } mask_t* get_device_masks() { return d_masks_; } + float_t* get_weights_ptr() { return &W[0]; } + float_t* get_weights_device_ptr() { return d_W; } + float_t* get_grads_ptr() { return &weight_grad[0]; } + float_t* get_grads_device_ptr() { return d_weight_grad; } + //! debug print function void print_layer_info(); virtual void set_sample_mask(size_t sample_begin, size_t sample_end, @@ -126,16 +131,15 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { + // std::cout << name_ << ": weight updating ... "; // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY - // std::cout << name_ << ": weight updating ... "; // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
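// Editorial note (not part of the original patch): opt->update applies the chosen
// optimizer's rule elementwise to W given weight_grad; a plain gradient-descent step,
// for example, would compute W[i] -= learning_rate * weight_grad[i], while adaptive
// rules (e.g. Adam-style updates) would also track per-weight running moments.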
bool parallel = (W.size() >= 512); opt->update(layer::weight_grad, layer::W, parallel); // W += grad #else - //std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index edd7fc6eb6..e33345793b 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,7 +27,7 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); -float_t axpy(size_t n, const float_t a, float_t *x, float_t *y); +void axpy(size_t n, const float_t a, float_t *x, float_t *y); //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index e4016231d4..69355ee6b2 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -67,10 +67,9 @@ class Net { } void bprop(); // back propogation - void normalize(); - void regularize(); - // update trainable weights after back-propagation - void update_weights(optimizer* opt); + void normalize(); // Scale gradient to counterbalance accumulation + void regularize(); // add weight decay + void update_weights(optimizer* opt); // update trainable weights after back-propagation protected: bool is_single_class; // single-class (one-hot) or multi-class label diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b640acd75a..3233cd0bc6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -125,11 +125,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - acc_t loss = 0.0; - for (size_t i = 0; i < y*z; i+=z) { - loss += math::l2_norm(z, &layer::W[i]); - } - return loss; + return math::l2_norm(y*z, &layer::W[0]); } #endif // end if CPU_ONLY diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 322500d916..28e6002279 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -88,8 +88,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - acc_t loss = l2_norm_gpu(y*z, d_W); - return loss; + return l2_norm_gpu(y*z, d_W); } } // namespace diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 9cff465a73..0cc7812e9e 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -78,7 +78,8 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) } // SAXPY stands for โ€œSingle-precision A*X Plus Y" -float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { +/* +void axpy(size_t n, const float_t a, float_t *x, float_t *y) { const size_t alignedN = n - n % vec_len; const __m256 alpha = _mm256_set1_ps(a); for (size_t i = 0; i < alignedN; i += vec_len) { @@ -96,8 +97,9 @@ float_t l2_norm(size_t n, const float_t* in) { vsum = _mm256_add_ps(vsum, _mm256_mul_ps(a, 
a)); } __m256 sum = _mm256_hadd_ps(vsum, vsum); - return ((float_t*)&sum)[0] + ((float_t*)&sum)[2];; + return (((float_t*)&sum)[0] + ((float_t*)&sum)[2]) / 2.0; } +*/ #else // vector multiply scalar void mul_scalar(const float_t alpha, vec_t& Y) { @@ -108,16 +110,24 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } -float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { - for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; +//void axpy(size_t n, const float_t a, float_t *x, float_t *y) { +// for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; +//} + +//float_t l2_norm(size_t n, const float_t* a) { +// float_t sum = 0.0; +// for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; +// return sum / 2.0; +//} +#endif + +void axpy(size_t n, const float_t a, float_t *x, float_t *y) { + cblas_saxpy(n, a, x, 1, y, 1); } -float_t l2_norm(size_t n, const float_t* a) { - float_t sum = 0.0; - for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; - return sum/2.0; +float_t l2_norm(size_t n, const float_t* x) { + return cblas_snrm2(n, x, 1); } -#endif // dot product float_t dot(const vec_t& x, const vec_t& y) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index b906702d9c..0bef3a47d3 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -278,21 +278,20 @@ void scale_gpu(const int n, const float alpha, const float* x, float* y) { } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { y[index] = alpha; } + CUDA_KERNEL_LOOP(i, n) { y[i] = alpha; } } -void set_gpu(const int n, const float_t alpha, float_t* Y) { +void set_gpu(const int n, const float_t alpha, float_t* y) { if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * n)); + CUDA_CHECK(cudaMemset(y, 0, sizeof(float_t) * n)); return; } - set_kernel<<>>(n, alpha, Y); + set_kernel<<>>(n, alpha, y); CudaTest("solving set kernel failed"); } -__global__ void add_scalar_kernel(const int n, const float_t alpha, - float_t* y) { - CUDA_KERNEL_LOOP(index, n) { y[index] += alpha; } +__global__ void add_scalar_kernel(const int n, const float_t a, float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] += a; } } void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { @@ -302,7 +301,7 @@ void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } + CUDA_KERNEL_LOOP(i, n) { y[i] = a[i] + b[i]; } } void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* y) { @@ -316,10 +315,29 @@ __global__ void axpy_kernel(const int n, const float_t a, const float_t* x, } void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { - axpy_kernel<<>>(n, a, x, y); + //axpy_kernel<<>>(n, a, x, y); + CUBLAS_CHECK(cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); CudaTest("solving axpy kernel failed"); } +__global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { + CUDA_KERNEL_LOOP(i, n) { + float_t product = a[i] * a[i]; + atomicAdd(sum, product); + } +} + +acc_t l2_norm_gpu(int n, float_t* x) { + float_t sum = 0.0; + CUBLAS_CHECK(cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); + //float_t *d_sum; + //CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); + //CUDA_CHECK(cudaMemcpy(d_sum, &sum, 
sizeof(acc_t), cudaMemcpyHostToDevice)); + //l2_norm_kernel<<>>(n, x, d_sum); + //CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), cudaMemcpyDeviceToHost)); + return (acc_t)sum / 2.0; +} + void copy_gpu(int len, const float_t* in, float_t* out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } @@ -693,8 +711,3 @@ acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* return *(total_loss.cpu_rd_ptr()) / count; } -acc_t l2_norm_gpu(int n, float_t * tensor) { - acc_t sum = 0.0; - return sum / 2.0; -} - diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 58fe59312e..1428c7508d 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -253,7 +253,27 @@ void Net::bprop() { } } +// Scale gradient to counterbalance accumulation +void Net::normalize() { +} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; +#ifdef CPU_ONLY + // TODO: parallel + math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), + layers[layer_id]->get_grads_ptr()); +#else + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); +#endif +} + void Net::update_weights(optimizer* opt) { + normalize(); + regularize(); for (size_t i = 0; i < num_layers; i++) { if (layers[i]->trainable()) { layers[i]->update_weight(opt); From 725f7b91b51c413cc6c04f2ec106ab50a4b9c9b7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 25 Apr 2020 16:16:55 -0500 Subject: [PATCH 190/660] add l2_norm_layer --- libdeepgalois/CMakeLists.txt | 4 ++ .../include/deepgalois/layers/l2_norm_layer.h | 20 ++++++ .../include/deepgalois/math_functions.hh | 4 +- libdeepgalois/include/deepgalois/net.h | 26 ++++---- libdeepgalois/src/layers/l2_norm_layer.cpp | 54 +++++++++++++++ libdeepgalois/src/layers/l2_norm_layer.cu | 18 +++++ libdeepgalois/src/math_functions.cu | 9 ++- libdeepgalois/src/net.cpp | 35 ++++++++-- libdeepgalois/src/net.cu | 2 +- libdeepgalois/src/utils.cpp | 66 ++++++++++--------- lonestargnn/gcn/gcn.cpp | 3 +- lonestargnn/include/lonestargnn.h | 4 +- 12 files changed, 191 insertions(+), 54 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/l2_norm_layer.h create mode 100644 libdeepgalois/src/layers/l2_norm_layer.cpp create mode 100644 libdeepgalois/src/layers/l2_norm_layer.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 193988f414..3f592f0d18 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -42,6 +42,7 @@ else() src/layers/softmax_loss_layer.cu src/layers/sigmoid_loss_layer.cu src/layers/leaky_relu_layer.cu + src/layers/l2_norm_layer.cu src/layers/relu_layer.cu src/layers/aggregator.cu src/math_functions.cu @@ -64,12 +65,14 @@ set(sources src/layers/sigmoid_loss_layer.cpp src/layers/graph_conv_layer.cpp src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp + src/sampler.cpp src/utils.cpp src/node.cpp src/net.cpp @@ -80,6 +83,7 @@ set(sources src/layers/sigmoid_loss_layer.cpp src/layers/graph_conv_layer.cpp src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h new file 
mode 100644 index 0000000000..b15c1ae671 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// L2 Normalization Layer +class l2_norm_layer : public layer { +public: + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims); + l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) : + l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} + ~l2_norm_layer() {} + std::string layer_type() const override { return std::string("l2_norm"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); +protected: + float_t epsilon_; + float_t scale_; +}; +} // namespace diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index e33345793b..5611caaa94 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -171,5 +171,7 @@ void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t l2_norm_gpu(int n, float_t *tensor); +acc_t l2_norm_gpu(int n, const float_t *in); +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t *in_diff, float_t *out_diff); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 69355ee6b2..a3fa9d0dee 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -26,7 +26,8 @@ namespace deepgalois { // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: - Net() : is_single_class(true), num_samples(0), num_classes(0), + Net() : is_single_class(true), has_l2norm(false), has_dense(false), + num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), train_begin(0), train_end(0), train_count(0), @@ -35,20 +36,26 @@ class Net { train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} void init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool is_single, Graph* dGraph); + bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } + void construct_layers(); void append_out_layer(size_t layer_id); + void append_l2norm_layer(size_t layer_id); + void append_dense_layer(size_t layer_id); + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true); //! Add a convolution layer to the network + void train(optimizer* opt, bool need_validate); // training double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference void read_test_masks(std::string dataset, Graph* dGraph); acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation - - //! 
Add a convolution layer to the network - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true); + void bprop(); // back propogation + void normalize(); // Scale gradient to counterbalance accumulation + void regularize(); // add weight decay + void update_weights(optimizer* opt); // update trainable weights after back-propagation //! Save the context object to all layers of the network void set_contexts() { @@ -66,13 +73,10 @@ class Net { layers[i]->print_layer_info(); } - void bprop(); // back propogation - void normalize(); // Scale gradient to counterbalance accumulation - void regularize(); // add weight decay - void update_weights(optimizer* opt); // update trainable weights after back-propagation - protected: bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp new file mode 100644 index 0000000000..46379aed60 --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -0,0 +1,54 @@ +#include "deepgalois/layers/l2_norm_layer.h" + +namespace deepgalois { + +l2_norm_layer::l2_norm_layer(unsigned level, float_t eps, float_t scale, + dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +#ifdef CPU_ONLY +void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { + //for (size_t i = 0; i < x; i++) { + float_t sum = 0.0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum += in_data[idx + j] * in_data[idx + j]; + } + sum = std::max(sum, epsilon_); + sum = sqrt(sum); + for (size_t j = 0; j < y; j++) { + out_data[idx + j] = in_data[idx + j] / sum * scale_; + } + }, galois::loopname("l2_norm")); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { + //for (size_t i = 0; i < x; i++) { + float_t sum_x2 = 0.0; + float_t coef0_axis0 = 0, coef1_axis0 = 0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum_x2 += powf(in_data[idx + j], 2); + coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; + } + coef1_axis0 = powf(sum_x2, -1.5); + for (size_t j = 0; j < y; j++) { + in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 + + out_grad[idx + j] * sum_x2 * coef1_axis0; + } + }, galois::loopname("d_l2_norm")); +} +#endif + +} // namespace diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu new file mode 100644 index 0000000000..56128eb0d3 --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -0,0 +1,18 @@ +#include "deepgalois/layers/l2_norm_layer.h" + +namespace deepgalois { + +void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + 
l2_norm_gpu(x, y, in_data, out_data); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + d_l2_norm_gpu(x, y, in_data, out_grad, in_grad); +} + +} // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 0bef3a47d3..c1746d9075 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -327,7 +327,7 @@ __global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { } } -acc_t l2_norm_gpu(int n, float_t* x) { +acc_t l2_norm_gpu(int n, const float_t* x) { float_t sum = 0.0; CUBLAS_CHECK(cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); //float_t *d_sum; @@ -338,6 +338,13 @@ acc_t l2_norm_gpu(int n, float_t* x) { return (acc_t)sum / 2.0; } +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out) { +} + +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, + float_t *in_diff, float_t *out_diff) { +} + void copy_gpu(int len, const float_t* in, float_t* out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 1428c7508d..c5ef556032 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -8,13 +8,16 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool is_single, Graph* dGraph) { + bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph) { + assert(num_conv > 0); num_conv_layers = num_conv; num_epochs = epochs; learning_rate = lr; dropout_rate = dropout; weight_decay = wd; - is_single_class = is_single; + is_single_class = single; + has_l2norm = l2norm; + has_dense = dense; galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, @@ -23,7 +26,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, ", weight_decay ", weight_decay, "\n"); #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - context->set_label_class(is_single); + context->set_label_class(is_single_class); num_samples = context->read_graph(dataset_str, selfloop); #else context = new deepgalois::DistContext(); @@ -78,13 +81,19 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, // train count and val count are LOCAL counts num_layers = num_conv_layers + 1; + if (has_l2norm) num_layers ++; + if (has_dense) num_layers ++; // initialize feature metadata feature_dims.resize(num_layers + 1); feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers-1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY @@ -287,12 +296,24 @@ void Net::construct_layers() { 
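// Illustrative sketch (added comment, not from the original patch): assuming
// num_conv_layers == 2 and add_l2norm enabled (add_dense disabled), the code below builds
//   layer 0: graph_conv (act) -> layer 1: graph_conv -> layer 2: l2_norm -> layer 3: output loss (softmax or sigmoid)
// i.e. the first num_conv_layers-1 conv layers use activation and the last conv layer does not.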
for (size_t i = 0; i < num_conv_layers-1; i++) append_conv_layer(i, true); // conv layers, act=true append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + if (has_l2norm) + append_l2norm_layer(num_conv_layers); // l2_norm layer + if (has_dense) + append_dense_layer(num_layers-2); // dense layer append_out_layer(num_layers-1); // output layer layers[0]->set_in_data(context->get_in_ptr()); // feed input data context->norm_factor_counting(); set_contexts(); } +//! Add an l2_norm layer to the network +void Net::append_l2norm_layer(size_t layer_id) { +} + +//! Add an dense layer to the network +void Net::append_dense_layer(size_t layer_id) { +} + //! Add an output layer to the network void Net::append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer @@ -405,7 +426,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { - auto preds = layers[num_conv_layers - 1]->next()->get_data(); + auto preds = layers[num_conv_layers]->next()->get_data(); auto ground_truth = context->get_labels_ptr(); return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index c7acda5666..d46b807711 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -130,7 +130,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, - layers[num_conv_layers - 1]->next()->get_data(), + layers[num_conv_layers]->next()->get_data(), context->get_labels_device_ptr()); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 46470e2997..b2b65c9582 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -13,49 +13,53 @@ const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pub // The formula for the F1 score is: // F1 = 2 * (precision * recall) / (precision + recall) // where precision = TP / (TP + FP), recall = TP / (TP + FN) -// TP: true positive; FP: false positive; FN: false negtive. +// TP: true positive; FP: false positive; FN: false negative. // In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. 
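// Worked example (added for clarity; the counts are hypothetical): two classes with
// per-class (TP, FP, FN) = (8, 2, 0) and (1, 0, 9) give per-class F1 of 0.889 and 0.182,
// so f1_macro = (0.889 + 0.182) / 2 ~= 0.54; pooling the counts first gives
// precision_mic = 9/11, recall_mic = 9/18 and f1_micro = 2*P*R/(P+R) ~= 0.62.
// The function below returns the micro-averaged score and prints both.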
// Please refer to https://sebastianraschka.com/faq/docs/multiclass-metric.html, // http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf (p.1672) // and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { - float beta = 1.0; - std::vector true_positive(num_classes, 0); - std::vector false_positive(num_classes, 0); - std::vector false_negtive(num_classes, 0); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - for (size_t j = 0; j < num_classes; j++) { - auto idx = i * num_classes + j; + double precision_cls(0.), recall_cls(0.), f1_accum(0.); + int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); + for (size_t col = 0; col < num_classes; col++) { + int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); + for (size_t row = begin; row < end; row ++) { + //galois::do_all(galois::iterate(begin, end), [&](const auto& row) { + if (masks[row] == 1) { + auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { - __sync_fetch_and_add(&true_positive[j], 1); + //__sync_fetch_and_add(&tp_cls, 1); + tp_cls += 1; } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { - __sync_fetch_and_add(&false_positive[j], 1); + //__sync_fetch_and_add(&fp_cls, 1); + fp_cls += 1; } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { - __sync_fetch_and_add(&false_negtive[j], 1); + //__sync_fetch_and_add(&fn_cls, 1); + fn_cls += 1; + } else if (ground_truth[idx] == 0 && pred[idx] <= 0.5) { + //__sync_fetch_and_add(&tn_cls, 1); + tn_cls += 1; } } - } - }, galois::loopname("MaskedF1Score")); - acc_t pNumerator = 0.0; - acc_t pDenominator = 0.0; - acc_t rNumerator = 0.0; - acc_t rDenominator = 0.0; - for (size_t i = 0; i < num_classes; i++) { - acc_t fn = (acc_t)false_negtive[i]; // false negtive - acc_t fp = (acc_t)false_positive[i]; // false positive - acc_t tp = (acc_t)true_positive[i]; // true positive - pNumerator = pNumerator + tp; - pDenominator = pDenominator + (tp + fp); - rNumerator = rNumerator + tp; - rDenominator = rDenominator + (tp + fn); + } + //}, galois::loopname("MaskedF1Score")); + tp_accum += tp_cls; + fn_accum += fn_cls; + fp_accum += fp_cls; + tn_accum += tn_cls; + precision_cls = tp_cls + fp_cls > 0 ? (double)tp_cls/(double)(tp_cls+fp_cls) : 0.; + recall_cls = tp_cls+fn_cls > 0 ? (double)tp_cls/(double)(tp_cls+fn_cls) : 0.; + f1_accum += recall_cls+precision_cls > 0. ? 2.*(recall_cls*precision_cls)/(recall_cls+precision_cls) : 0.; } - auto recallMicro = rNumerator / rDenominator; - acc_t precisionMicro = pNumerator / pDenominator; - auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / - ((beta * beta) * precisionMicro + recallMicro); - return fscoreMicro; + double f1_macro = f1_accum/(double)num_classes; + //double accuracy_mic = (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); + double precision_mic = tp_accum+fp_accum > 0 ? (double)tp_accum/(double)(tp_accum+fp_accum) : 0.; + double recall_mic = tp_accum+fn_accum > 0 ? (double)tp_accum/(double)(tp_accum+fn_accum) : 0.; + double f1_micro = recall_mic+precision_mic > 0. ? 
2.*(recall_mic*precision_mic)/(recall_mic+precision_mic) : 0.; + std::cout << std::setprecision(3) << std::fixed << + " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; + return f1_micro; } #ifndef GALOIS_USE_DIST diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 1a3698bc96..4a6a06639a 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -26,7 +26,8 @@ int main(int argc, char** argv) { // read network, features, ground truth, initialize metadata network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, - dropout_rate, weight_decay, add_selfloop, is_single_class, dGraph); + dropout_rate, weight_decay, add_selfloop, + is_single_class, add_l2norm, add_dense, dGraph); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 1c96548a36..e2191fb7a1 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -28,7 +28,7 @@ static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt dropout_rate("d", +static cll::opt dropout_rate("dr", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); @@ -41,6 +41,8 @@ static cll::opt is_single_class("sc", static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); +static cll::opt add_dense("d", cll::desc("add an dense layer"), cll::init(0)); //! 
standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 707a773285e3ffbff0d2ed1264971300ff2725ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 25 Apr 2020 20:49:51 -0500 Subject: [PATCH 191/660] update sampler --- libdeepgalois/include/deepgalois/context.h | 6 ++- .../include/deepgalois/layers/layer.h | 4 +- .../deepgalois/layers/sigmoid_loss_layer.h | 1 + .../deepgalois/layers/softmax_loss_layer.h | 5 +-- libdeepgalois/include/deepgalois/net.h | 15 ++++++- libdeepgalois/include/deepgalois/sampler.h | 2 +- .../src/layers/sigmoid_loss_layer.cpp | 9 +++- .../src/layers/sigmoid_loss_layer.cu | 6 ++- .../src/layers/softmax_loss_layer.cpp | 9 +++- .../src/layers/softmax_loss_layer.cu | 6 ++- libdeepgalois/src/net.cpp | 43 ++++++++++++++++++- libdeepgalois/src/sampler.cpp | 4 +- lonestargnn/gcn/gcn.cpp | 3 +- lonestargnn/include/lonestargnn.h | 4 +- 14 files changed, 95 insertions(+), 22 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index e40b6a6371..786fc48d5d 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -30,6 +30,7 @@ class Context { label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label label_t* get_labels_ptr() { return labels; } + label_t* get_labels_subg_ptr() { return labels_subg; } label_t* get_labels_device_ptr() { return d_labels; } float_t* get_in_ptr(); float_t* get_norm_factor() { return norm_factor; } @@ -40,10 +41,12 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N + Graph* subgraph_cpu; void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getCpuGraphPointer(); + Graph* getCpuSubgraphPointer() { return subgraph_cpu; }; #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } @@ -59,7 +62,8 @@ class Context { size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph - label_t *labels; // labels for classification: N x 1 + label_t *labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE + label_t *labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D float_t* norm_factor; // normalization constant based on graph structure label_t* d_labels; // labels on device diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 17ab4e6694..a18802f198 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -52,7 +52,7 @@ class layer : public deepgalois::node { std::vector out_dims) : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), - output_dims(out_dims) { + output_dims(out_dims), labels(NULL) { add_edge(); } virtual ~layer() = default; @@ -72,6 +72,7 @@ class layer : public deepgalois::node { // is this layer trainable? 
void set_trainable(bool trainable) { trainable_ = trainable; } + void set_labels_ptr(label_t *ptr) { labels = ptr; } bool trainable() const { return trainable_; } // name metadata @@ -166,6 +167,7 @@ class layer : public deepgalois::node { mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 ContextType* context; + label_t* labels; #ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 0f46cde043..760b6f0ab1 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -10,6 +10,7 @@ class sigmoid_loss_layer : public layer { std::string layer_type() const override { return std::string("sigmoid_loss"); } + inline label_t get_label(size_t i, size_t j); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 1a5b7e86ee..060698e3d9 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -7,9 +7,8 @@ class softmax_loss_layer : public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer(); - std::string layer_type() const override { - return std::string("softmax_loss"); - } + std::string layer_type() const override { return std::string("softmax_loss"); } + inline label_t get_label(size_t i); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index a3fa9d0dee..0cd94adc05 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -8,10 +8,12 @@ #include "galois/Timer.h" #include "deepgalois/types.h" #include "deepgalois/gtypes.h" +#include "deepgalois/layers/l2_norm_layer.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" +#include "deepgalois/sampler.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else @@ -27,6 +29,7 @@ namespace deepgalois { class Net { public: Net() : is_single_class(true), has_l2norm(false), has_dense(false), + neighbor_sample_size(0), subgraph_sample_size(0), num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), @@ -36,7 +39,9 @@ class Net { train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} void init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph); + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sample_size = 0, unsigned subg_sample = 0, + Graph* dGraph = NULL); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return 
num_samples; } @@ -77,6 +82,8 @@ class Net { bool is_single_class; // single-class (one-hot) or multi-class label bool has_l2norm; // whether the net contains an l2_norm layer bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers @@ -95,16 +102,20 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* test_masks; // masks for test mask_t* d_test_masks; // masks for test on device + mask_t* subgraph_masks; // masks for subgraph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network + Sampler *sampler; #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else deepgalois::DistContext* context; #endif - // comparing outputs with the ground truth (labels) + void lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels); + #ifdef CPU_ONLY + // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); #else diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 8842f0e442..900ff1de2e 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -8,7 +8,7 @@ class Sampler { ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sampler(Graph &g, Graph &sg, size_t n); + void subgraph_sample(size_t n, Graph &g, Graph &sg, VertexList &vertex_set, mask_t *masks); // !API function for user-defined selection strategy virtual void select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 763bd6646d..b94cd83e14 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -16,6 +16,11 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { delete loss; } +inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { + return context->get_label(i, j); + //return labels(i*input_dims[1]+j); +} + void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -25,7 +30,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid // one hot encoded vector for the labels float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); + for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); // loss calculation loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); delete ground_truth; @@ -41,7 +46,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* size_t idx = len * i; float_t *norm_grad = new float_t[len]; float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); + for (size_t j = 0; j < len; j++) ground_truth[j] = 
(float_t)get_label(i, j); // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index c52b9089f0..e5adbcfc6f 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -19,16 +19,18 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + //label_t *labels = context->get_labels_device_ptr(); init_const_gpu(input_dims[0], 0.0, loss); sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->get_labels_device_ptr(), loss, out_data); + d_masks_, labels, loss, out_data); } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + //label_t *labels = context->get_labels_device_ptr(); d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->get_labels_device_ptr(), out_data, in_grad); + labels, out_data, in_grad); } acc_t sigmoid_loss_layer::get_prediction_loss() { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 4a92e56ec3..0428f248b2 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -16,6 +16,11 @@ softmax_loss_layer::~softmax_loss_layer() { delete loss; } +inline label_t softmax_loss_layer::get_label(size_t i) { + //return labels[i]; + return context->get_label(i); +} + // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, @@ -27,7 +32,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax // one hot encoded vector for the labels std::vector groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[context->get_label(i)] = 1.0; // one-hot + groundTruth[get_label(i)] = 1.0; // one-hot // loss calculation loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); } @@ -46,7 +51,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, if (masks_[i] == 1) { // masked vec_t norm_grad(len); std::vector groundTruth(len, 0.0); - groundTruth[context->get_label(i)] = 1.0; + groundTruth[get_label(i)] = 1.0; // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); // derviative softmax to gradient used in the next layer diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index e73ef27f33..5e9a573abe 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -20,15 +20,17 @@ softmax_loss_layer::~softmax_loss_layer() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); + //label_t *labels = context->get_labels_device_ptr(); softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->get_labels_device_ptr(), loss, out_data); + d_masks_, labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* 
out_data, float_t* out_grad, float_t* in_grad) { + //label_t *labels = context->get_labels_device_ptr(); d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->get_labels_device_ptr(), out_data, in_grad); + labels, out_data, in_grad); } acc_t softmax_loss_layer::get_prediction_loss() { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c5ef556032..596aadac04 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -8,7 +8,8 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph) { + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, Graph* dGraph) { assert(num_conv > 0); num_conv_layers = num_conv; num_epochs = epochs; @@ -18,6 +19,8 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, is_single_class = single; has_l2norm = l2norm; has_dense = dense; + neighbor_sample_size = neigh_sz; + subgraph_sample_size = subg_sz; galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, @@ -28,6 +31,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, context = new deepgalois::Context(); context->set_label_class(is_single_class); num_samples = context->read_graph(dataset_str, selfloop); + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); #else context = new deepgalois::DistContext(); num_samples = dGraph->size(); @@ -103,6 +107,9 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } +void Net::lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels) { +} + void Net::train(optimizer* opt, bool need_validate) { std::string header = ""; std::string seperator = " "; @@ -118,6 +125,11 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); double total_train_time = 0.0; + int num_subg_remain = 0; + if (subgraph_sample_size) { + subgraph_masks = new mask_t[num_samples]; + std::copy(train_masks, train_masks+num_samples, subgraph_masks); + } Timer t_epoch; // run epochs @@ -125,6 +137,15 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "Epoch ", std::setw(3), i, seperator); t_epoch.Start(); + if (subgraph_sample_size && num_subg_remain == 0) { +#ifdef CPU_ONLY + VertexList vertices; + sampler->subgraph_sample(subgraph_sample_size, *(context->getCpuGraphPointer()), + *(context->getCpuSubgraphPointer()), vertices, subgraph_masks); + lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); +#endif + num_subg_remain += 1; // num_threads + } // training steps set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -200,6 +221,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { end = train_end; count = train_count; masks = train_masks; + if (subgraph_sample_size) masks = subgraph_masks; } else if (type == "val") { begin = val_begin; end = val_end; @@ -308,10 +330,24 @@ void Net::construct_layers() { //! 
Add an l2_norm layer to the network void Net::append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); } //! Add an dense layer to the network void Net::append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); } //! Add an output layer to the network @@ -325,6 +361,11 @@ void Net::append_out_layer(size_t layer_id) { layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); +#ifdef CPU_ONLY + layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); +#else + layers[layer_id]->set_labels_ptr(context->get_labels_device_ptr()); +#endif connect(layers[layer_id - 1], layers[layer_id]); } diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index fdfb9802cf..3b3ae84b85 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -114,8 +114,8 @@ void Sampler::select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t } } -void Sampler::subgraph_sampler(Graph &g, Graph&sg, size_t n) { - VertexList vertex_set(n); +void Sampler::subgraph_sample(size_t n, Graph &g, Graph&sg, VertexList &vertex_set, mask_t *masks) { + vertex_set.resize(n); select_vertices(g, vertex_set, n, m); generate_subgraph(vertex_set, g, sg); } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 4a6a06639a..ba9cbe3529 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -27,7 +27,8 @@ int main(int argc, char** argv) { // read network, features, ground truth, initialize metadata network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, - is_single_class, add_l2norm, add_dense, dGraph); + is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, dGraph); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index e2191fb7a1..cdfda9eba0 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -34,8 +34,6 @@ static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", - cll::desc("Maximum size of the downsampled adjacency lists (default value 25)"), cll::init(25)); static cll::opt is_single_class("sc", cll::desc("single-class or multi-class label (default single)"), cll::init(1)); static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); @@ -43,6 +41,8 @@ static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); static cll::opt add_dense("d", 
cll::desc("add an dense layer"), cll::init(0)); +static cll::opt neighbor_sample_sz("ns", cll::desc("neighbor sampling size (default value 0)"), cll::init(0)); +static cll::opt subgraph_sample_sz("ss", cll::desc("subgraph sampling size (default value 0)"), cll::init(0)); //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 22efddfcf0cad139a7eb660dc95bb7b7735ba19e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 26 Apr 2020 16:41:35 -0500 Subject: [PATCH 192/660] refine --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/context.h | 33 +++--- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 66 ++++++------ libdeepgalois/include/deepgalois/lgraph.h | 100 ------------------ libdeepgalois/src/context.cpp | 88 ++++++++++----- libdeepgalois/src/context.cu | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 27 +++-- libdeepgalois/src/layers/graph_conv_layer.cu | 8 +- .../src/layers/sigmoid_loss_layer.cpp | 4 +- .../src/layers/sigmoid_loss_layer.cu | 2 - .../src/layers/softmax_loss_layer.cpp | 4 +- .../src/layers/softmax_loss_layer.cu | 2 - libdeepgalois/src/net.cpp | 28 +---- libdeepgalois/src/net.cu | 39 +++++-- 15 files changed, 174 insertions(+), 232 deletions(-) delete mode 100644 libdeepgalois/include/deepgalois/lgraph.h diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 37e2eea372..704247d54b 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -39,7 +39,7 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - Graph* getCpuGraphPointer() { + Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 786fc48d5d..ea2b5f2156 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -8,7 +8,6 @@ #include "deepgalois/types.h" #include "deepgalois/utils.h" #ifdef CPU_ONLY -#include "deepgalois/lgraph.h" #include "deepgalois/gtypes.h" #else #include "graph_gpu.h" @@ -28,32 +27,35 @@ class Context { size_t read_features(std::string dataset_str, std::string filetype = "bin"); label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label - label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label - label_t* get_labels_ptr() { return labels; } - label_t* get_labels_subg_ptr() { return labels_subg; } - label_t* get_labels_device_ptr() { return d_labels; } - float_t* get_in_ptr(); - float_t* get_norm_factor() { return norm_factor; } + //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label + float_t* get_norm_factor_ptr() { return norm_factor; } + void set_label_class(bool is_single = true) { is_single_class = is_single; } void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); - void set_label_class(bool is_single = true) { is_single_class = is_single; } #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N Graph* subgraph_cpu; - void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph - Graph* getCpuGraphPointer(); - Graph* getCpuSubgraphPointer() { return subgraph_cpu; }; + Graph* getGraphPointer() { return graph_cpu; } + Graph* getSubgraphPointer() { return subgraph_cpu; }; + float_t* get_in_ptr() { return h_feats; } + label_t* get_labels_ptr() { return labels; } + label_t* get_labels_subg_ptr() { return labels_subg; } #else CSRGraph graph_gpu; // the input graph, |V| = N + CSRGraph subgraph_gpu; + CSRGraph* getGraphPointer() { return &graph_gpu; } + CSRGraph* getSubgraphPointer() { return &subgraph_gpu; }; + float_t* get_in_ptr() { return d_feats; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } - CSRGraph* getGpuGraphPointer() { return &graph_gpu; } #endif protected: @@ -67,12 +69,17 @@ class Context { float_t* h_feats; // input features: N x D float_t* norm_factor; // normalization constant based on graph structure label_t* d_labels; // labels on device + label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device -#ifndef CPU_ONLY + +#ifdef CPU_ONLY + void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); +#else static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif }; + } // end deepgalois namespace diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 63062133df..eb42fe1093 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -36,7 +36,7 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor(); } + void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! and save result to out_data diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a18802f198..0e94a53d49 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -57,36 +57,38 @@ class layer : public deepgalois::node { } virtual ~layer() = default; virtual std::string layer_type() const = 0; - virtual void set_netphase(deepgalois::net_phase phase) {} - //! save context - virtual void set_context(ContextType* ctx) { context = ctx; } - //! return layer loss + void print_layer_info(); //! 
debug print function + + // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } virtual acc_t get_weight_decay_loss() { return acc_t(0); } - - // main functions for layer work - virtual void forward_propagation(const float_t* in_data, - float_t* out_data) = 0; - virtual void back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) = 0; - - // is this layer trainable? - void set_trainable(bool trainable) { trainable_ = trainable; } - void set_labels_ptr(label_t *ptr) { labels = ptr; } bool trainable() const { return trainable_; } - - // name metadata - void set_name(std::string name) { name_ = name; } std::string get_name() { return name_; } - mask_t* get_device_masks() { return d_masks_; } float_t* get_weights_ptr() { return &W[0]; } float_t* get_weights_device_ptr() { return d_W; } float_t* get_grads_ptr() { return &weight_grad[0]; } float_t* get_grads_device_ptr() { return d_weight_grad; } - //! debug print function - void print_layer_info(); + // set methods + virtual void set_netphase(deepgalois::net_phase phase) {} + virtual void set_context(ContextType* ctx) { context = ctx; } + void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? + void set_labels_ptr(label_t *ptr) { labels = ptr; } + void set_name(std::string name) { name_ = name; } // name metadata +#ifdef CPU_ONLY + void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } +#else + void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } +#endif + + //! set the data of the previous layer connected to this one + void set_in_data(float_t* data) { + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { begin_ = sample_begin; @@ -99,22 +101,18 @@ class layer : public deepgalois::node { #endif } - //! set the data of the previous layer connected to this one - void set_in_data(float_t* data) { - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - } - void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); } - void alloc_grad() { - // allocate memory for intermediate gradients - } + + // main functions for layer work + virtual void forward_propagation(const float_t* in_data, + float_t* out_data) = 0; + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) = 0; //! calls forward propagation using previous layer as input and writes //! to next layer as output @@ -132,9 +130,6 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { - // std::cout << name_ << ": weight updating ... "; - // vec_t diff; - // prev()->merge_grads(&diff); #ifdef CPU_ONLY // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
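// A minimal sketch of how the reorganized layer interface above is driven end to end;
// `opt`, `layers`, and the loop bounds are illustrative, connect() is the free helper
// used by Net, and the member functions are the ones declared in this class:
//
//   // wiring: allocate intermediate buffers, chain consecutive layers, feed input features
//   for (size_t i = 0; i < num_layers; i++) layers[i]->add_edge();
//   for (size_t i = 1; i < num_layers; i++) connect(layers[i - 1], layers[i]);
//   layers[0]->set_in_data(context->get_in_ptr());
//
//   // one training step: forward pass, backward pass in reverse order, then weight update
//   for (size_t i = 0; i < num_layers; i++) layers[i]->forward();
//   for (size_t i = num_layers; i-- > 0; )  layers[i]->backward();
//   for (size_t i = 0; i < num_layers; i++) layers[i]->update_weight(opt);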
@@ -168,6 +163,11 @@ class layer : public deepgalois::node { float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; +#ifdef CPU_ONLY + Graph *graph_cpu; +#else + CSRGraph *graph_gpu; +#endif #ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h deleted file mode 100644 index 029d12d44b..0000000000 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ /dev/null @@ -1,100 +0,0 @@ -#ifndef __LGRAPH_HPP__ -#define __LGRAPH_HPP__ - -// defines the Learning Graph (LGraph) data structure -#include -#include - -namespace deepgalois { - -typedef unsigned IndexT; -typedef float ValueT; - -/** - * Learning graph. - * - * Provides basic accesors and such; nothing special. Just a CSR. - * Ultimatly becomes an LC_CSR. - * - * @todo remove this intermediate step if using edgelists - */ -class LGraph { -public: - LGraph() : directed_(false) {} - void clean() { - delete[] rowptr_; - delete[] colidx_; - } - bool directed() const { return directed_; } - size_t num_vertices() const { return num_vertices_; } - size_t num_edges() const { return num_edges_; } - IndexT* out_rowptr() const { return rowptr_; } - IndexT* out_colidx() const { return colidx_; } - unsigned out_degree(IndexT n) const { return rowptr_[n + 1] - rowptr_[n]; } - IndexT get_offset(IndexT n) { return rowptr_[n]; } - IndexT get_dest(IndexT n) { return colidx_[n]; } - - void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - num_vertices_ = m; - num_edges_ = 0; - std::cout << "num_vertices " << num_vertices_ << "\n"; - std::vector > vertices(m); - for (size_t i = 0; i < n; i++) { - std::set neighbors; - if (add_self_loop) neighbors.insert(i); - vertices.push_back(neighbors); - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - IndexT u, v; - edge_stream >> u; - edge_stream >> v; - vertices[u].insert(v); - if (symmetrize) vertices[v].insert(u); - } - in.close(); - for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); - std::cout << "num_edges " << num_edges_ << "\n"; - MakeCSR(vertices); - } - -private: - bool directed_; - size_t num_vertices_; - size_t num_edges_; - IndexT* rowptr_; - IndexT* colidx_; - - void MakeCSR(std::vector > vertices) { - std::vector degrees; - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i++) - degrees[i] = vertices[i].size(); - std::vector offsets(degrees.size() + 1); - IndexT total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; - degrees.clear(); - assert(num_edges_ == offsets[num_vertices_]); - colidx_ = new IndexT[num_edges_]; - rowptr_ = new IndexT[num_vertices_ + 1]; - for (size_t i = 0; i < num_vertices_ + 1; i++) - rowptr_[i] = offsets[i]; - for (size_t i = 0; i < num_vertices_; i++) { - for (auto dst : vertices[i]) - colidx_[offsets[i]++] = dst; - } - } -}; - -} // namespace -#endif diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 98b3f7ed15..0fc3fe0a95 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,7 +1,7 @@ /** * Based on common.hpp file of the Caffe deep learning library. 
*/ - +#include #include "deepgalois/context.h" namespace deepgalois { @@ -30,10 +30,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); - LGraph lgraph; - lgraph.read_edgelist(filename.c_str(), true); // symmetrize - genGraph(lgraph, *graph_cpu); - lgraph.clean(); + read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); @@ -54,19 +51,6 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo return graph_cpu->size(); } -void Context::genGraph(LGraph& lg, Graph& g) { - g.allocateFrom(lg.num_vertices(), lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i + 1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset++) - g.constructEdge(offset, lg.get_dest(offset), 0); - } -} - void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); g.constructNodes(); @@ -99,12 +83,6 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -Graph* Context::getCpuGraphPointer() { - return Context::graph_cpu; -} - -float_t* Context::get_in_ptr() { return h_feats; } - void Context::norm_factor_counting() { norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), @@ -115,6 +93,68 @@ void Context::norm_factor_counting() { else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); } + +void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + size_t num_vertices_ = m; + size_t num_edges_ = 0; + std::cout << "num_vertices " << num_vertices_ << "\n"; + std::vector > vertices(m); + for (size_t i = 0; i < n; i++) { + std::set neighbors; + if (add_self_loop) neighbors.insert(i); + vertices.push_back(neighbors); + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + VertexID u, v; + edge_stream >> u; + edge_stream >> v; + vertices[u].insert(v); + if (symmetrize) vertices[v].insert(u); + } + in.close(); + for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); + std::cout << "num_edges " << num_edges_ << "\n"; + + std::vector degrees; + degrees.resize(num_vertices_); + std::fill(degrees.begin(), degrees.end(), 0); + for (size_t i = 0; i < num_vertices_; i++) + degrees[i] = vertices[i].size(); + std::vector offsets(degrees.size() + 1); + uint32_t total = 0; + for (size_t n = 0; n < degrees.size(); n++) { + offsets[n] = total; + total += degrees[n]; + } + offsets[degrees.size()] = total; + degrees.clear(); + assert(num_edges_ == offsets[num_vertices_]); + EdgeID *colidx_ = new EdgeID[num_edges_]; + VertexID *rowptr_ = new VertexID[num_vertices_ + 1]; + for (size_t i = 0; i < num_vertices_ + 1; i++) + rowptr_[i] = offsets[i]; + for (size_t i = 0; i < num_vertices_; i++) { + for (auto dst : vertices[i]) + colidx_[offsets[i]++] = dst; + } + + graph_cpu->allocateFrom(num_vertices_, num_edges_); + graph_cpu->constructNodes(); + for (size_t i = 0; i < num_vertices_; i++) { + auto row_begin = rowptr_[i]; + auto row_end = rowptr_[i+1]; + graph_cpu->fixEndEdge(i, row_end); + for (auto offset = 
row_begin; offset < row_end; offset++) + graph_cpu->constructEdge(offset, colidx_[offset], 0); + } +} + #endif // labels contain the ground truth (e.g. vertex classes) for each example diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 93300abffb..86ad9003bf 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -158,5 +158,4 @@ void Context::copy_data_to_device() { //float_copy_device(n*feat_len, &h_feats[0], d_feats); //} -float_t* Context::get_in_ptr() { return d_feats; } } // namespace context diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3233cd0bc6..3c63468159 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -29,7 +29,7 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t *b = new float_t[len]; mvmul(n, len, &Q[0], self, a); mvmul(n, len, &W[0], neighbors, b); - deepgalois::math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors + math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } void graph_conv_layer::init() { @@ -65,12 +65,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == deepgalois::net_phase::train) { - deepgalois::math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); - } else deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); + math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + } else math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); // aggregate based on graph topology - graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); + graph_conv_layer::aggregate(z, *graph_cpu, out_temp, out_data); #ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; @@ -79,7 +79,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ GraphConvSync>("AggSync"); #endif // run relu activation on output if specified - if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); + if (act_) math::relu_cpu(x*z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -87,12 +87,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { // note; assumption here is that out_grad contains 1s or 0s via relu? 
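// In matrix form, the backward pass below computes (x = #vertices, y = input feature
// length, z = output feature length; A is the normalized adjacency, symmetric here):
//
//   out_temp    = A * out_grad          // update_all(): aggregate gradients, x-by-z
//   in_grad     = out_temp * W^T        // sgemm(NoTrans, Trans): x-by-z times z-by-y -> x-by-y
//   weight_grad = in_data^T * out_temp  // sgemm(Trans, NoTrans): y-by-x times x-by-z -> y-by-z
//
// Because the graph is symmetric, A^T == A, so the same aggregation routine serves both
// the forward and the backward pass.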
- if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_grad); + if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call - deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + deepgalois::update_all(z, *graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; @@ -106,18 +106,17 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, - out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, + out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y if (dropout_) { - deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, - in_grad); + math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } } // calculate weight gradients using input data // multiplied by gradients from last back prop step - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, - out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, + out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 28e6002279..ac29b73a7b 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -58,9 +58,9 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ else copy_gpu(x*y, in_data, in_temp); if (y > z) { sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + graph_conv_layer::aggregate(z, *graph_gpu, out_temp, out_data); } else { - graph_conv_layer::aggregate(y, context->graph_gpu, in_temp, in_temp1); + graph_conv_layer::aggregate(y, *graph_gpu, in_temp, in_temp1); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, out_data); } if (act_) relu_gpu(x * z, out_data, out_data); @@ -72,14 +72,14 @@ void graph_conv_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); if (y > z) { - graph_conv_layer::d_aggregate(z, context->graph_gpu, out_grad, out_temp); + graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); if (level_ != 0) sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } else { if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, in_temp); - graph_conv_layer::d_aggregate(y, context->graph_gpu, in_temp, in_grad); + graph_conv_layer::d_aggregate(y, *graph_gpu, in_temp, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, layer::d_weight_grad); } diff --git 
a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index b94cd83e14..a5ec7eef49 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -17,8 +17,8 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { } inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { - return context->get_label(i, j); - //return labels(i*input_dims[1]+j); + //return context->get_label(i, j); + return labels[i*input_dims[1]+j]; } void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index e5adbcfc6f..1fcc55e207 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -19,7 +19,6 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - //label_t *labels = context->get_labels_device_ptr(); init_const_gpu(input_dims[0], 0.0, loss); sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, labels, loss, out_data); @@ -28,7 +27,6 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - //label_t *labels = context->get_labels_device_ptr(); d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, out_data, in_grad); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 0428f248b2..2f944656de 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -17,8 +17,8 @@ softmax_loss_layer::~softmax_loss_layer() { } inline label_t softmax_loss_layer::get_label(size_t i) { - //return labels[i]; - return context->get_label(i); + return labels[i]; + //return context->get_label(i); } // TODO: need kernel fusion optimization diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 5e9a573abe..3eb5065edd 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -20,7 +20,6 @@ softmax_loss_layer::~softmax_loss_layer() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - //label_t *labels = context->get_labels_device_ptr(); softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, labels, loss, out_data); } @@ -28,7 +27,6 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - //label_t *labels = context->get_labels_device_ptr(); d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, out_data, in_grad); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 596aadac04..d44c9b4632 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -140,8 +140,8 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size && num_subg_remain == 0) { #ifdef CPU_ONLY VertexList vertices; - sampler->subgraph_sample(subgraph_sample_size, *(context->getCpuGraphPointer()), - *(context->getCpuSubgraphPointer()), vertices, 
subgraph_masks); + sampler->subgraph_sample(subgraph_sample_size, *(context->getGraphPointer()), + *(context->getSubgraphPointer()), vertices, subgraph_masks); lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); #endif num_subg_remain += 1; // num_threads @@ -154,19 +154,6 @@ void Net::train(optimizer* opt, bool need_validate) { // for use during backprop Tfw.start(); double fw_time = evaluate("train", train_loss, train_acc); - /* - train_loss = Net::fprop(train_begin, train_end, train_count, train_masks); // forward -#ifdef CPU_ONLY - Graph *g = context->getCpuGraphPointer(); -#else - CSRGraph *g = context->getGpuGraphPointer(); -#endif - if (is_single_class) { - train_acc = masked_accuracy(train_begin, train_end, train_count, train_masks, g); // predict - } else { - train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, train_masks, g); // predict - } - */ Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -244,11 +231,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { #endif loss = fprop(begin, end, count, masks); -#ifdef CPU_ONLY - Graph* g = context->getCpuGraphPointer(); -#else - CSRGraph* g = context->getGpuGraphPointer(); -#endif + auto g = context->getGraphPointer(); if (is_single_class) { acc = masked_accuracy(begin, end, count, masks, g); } else { @@ -361,11 +344,7 @@ void Net::append_out_layer(size_t layer_id) { layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); -#ifdef CPU_ONLY layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); -#else - layers[layer_id]->set_labels_ptr(context->get_labels_device_ptr()); -#endif connect(layers[layer_id - 1], layers[layer_id]); } @@ -380,6 +359,7 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); + layers[layer_id]->set_graph_ptr(context->getGraphPointer()); if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index d46b807711..27c3ea5de8 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -53,7 +53,8 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, float_t* preds, label_t* labels, f1count_t* true_positive, f1count_t* false_positive, - f1count_t* false_negtive) { + f1count_t* false_negtive, + f1count_t* true_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { @@ -65,6 +66,8 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, atomicAdd(&false_positive[j], 1.0); } else if (labels[idx] == 1 && preds[idx] <= 0.5) { atomicAdd(&false_negtive[j], 1.0); + } else { + atomicAdd(&true_negtive[j], 1.0); } } } @@ -78,45 +81,63 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, f1count_t* h_tp = new f1count_t[num_classes]; f1count_t* h_fp = new f1count_t[num_classes]; f1count_t* h_fn = new f1count_t[num_classes]; - f1count_t* d_tp, *d_fp, *d_fn; + f1count_t* h_tn = new f1count_t[num_classes]; + f1count_t* d_tp, *d_fp, *d_fn, *d_tn; float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); + float_malloc_device(num_classes, d_tn); init_const_gpu(num_classes, 0.0, d_tp); init_const_gpu(num_classes, 
0.0, d_fp); init_const_gpu(num_classes, 0.0, d_fn); + init_const_gpu(num_classes, 0.0, d_tn); masked_f1_score_kernel<<>>( - num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); + num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn, d_tn); CudaTest("solving masked_f1_score_kernel kernel failed"); CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; acc_t rNumerator = 0.0; acc_t rDenominator = 0.0; + acc_t precisionMacro = 0.0; + acc_t recallMacro = 0.0; for (size_t i = 0; i < num_classes; i++) { acc_t fn = (acc_t)h_fn[i]; // false negtive acc_t fp = (acc_t)h_fp[i]; // false positive acc_t tp = (acc_t)h_tp[i]; // true positive + acc_t tn = (acc_t)h_tn[i]; // true positive + + precisionMacro = precisionMacro + (tp / (tp + fp)); + recallMacro = recallMacro + (tp / (tp + fn)); pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); rNumerator = rNumerator + tp; rDenominator = rDenominator + (tp + fn); } + precisionMacro = precisionMacro / num_classes; + recallMacro = recallMacro / num_classes; + acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / + ((beta * beta) * precisionMacro + recallMacro); acc_t recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - acc_t fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / - ((beta * beta) * precisionMicro + recallMicro); - + acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + std::cout << std::setprecision(3) << std::fixed << + " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; + float_free_device(d_tp); float_free_device(d_fp); float_free_device(d_fn); + float_free_device(d_tn); delete h_tp; delete h_fp; delete h_fn; - return fscoreMicro; + delete h_tn; + return f1_micro; } namespace deepgalois { @@ -124,14 +145,14 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *g) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers - 1]->next()->get_data(), - context->get_labels_device_ptr()); + context->get_labels_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers]->next()->get_data(), - context->get_labels_device_ptr()); + context->get_labels_ptr()); } } // end namespace From b897ef54159935f4b8a8885834c799c93748663e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 26 Apr 2020 23:16:10 -0500 Subject: [PATCH 193/660] update sampler --- libdeepgalois/include/deepgalois/context.h | 20 +-- .../include/deepgalois/layers/layer.h | 5 +- libdeepgalois/include/deepgalois/net.h | 4 +- libdeepgalois/include/deepgalois/sampler.h | 43 +++++-- libdeepgalois/include/deepgalois/types.h | 2 + libdeepgalois/src/context.cpp | 13 +- libdeepgalois/src/context.cu | 8 +- libdeepgalois/src/net.cpp | 70 +++++++++-- libdeepgalois/src/sampler.cpp | 118 +++++++++++------- lonestargnn/include/lonestargnn.h | 1 + 10 files changed, 200 insertions(+), 84 deletions(-) diff --git 
a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index ea2b5f2156..52a306e90d 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -26,7 +26,7 @@ class Context { size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str, std::string filetype = "bin"); - label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label + label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label float_t* get_norm_factor_ptr() { return norm_factor; } @@ -41,15 +41,17 @@ class Context { //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; - float_t* get_in_ptr() { return h_feats; } - label_t* get_labels_ptr() { return labels; } - label_t* get_labels_subg_ptr() { return labels_subg; } + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg; } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg; } #else CSRGraph graph_gpu; // the input graph, |V| = N CSRGraph subgraph_gpu; CSRGraph* getGraphPointer() { return &graph_gpu; } CSRGraph* getSubgraphPointer() { return &subgraph_gpu; }; - float_t* get_in_ptr() { return d_feats; } + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } label_t* get_labels_subg_ptr() { return d_labels_subg; } inline static cublasHandle_t cublas_handle() { return cublas_handle_; } @@ -64,13 +66,15 @@ class Context { size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph - label_t *labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE - label_t *labels_subg; // labels for subgraph + label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE + label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D - float_t* norm_factor; // normalization constant based on graph structure + float_t* h_feats_subg; // input features for subgraph label_t* d_labels; // labels on device label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 0e94a53d49..a359467ad8 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -52,9 +52,7 @@ class layer : public deepgalois::node { std::vector out_dims) : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), - output_dims(out_dims), labels(NULL) { - add_edge(); - } + output_dims(out_dims), labels(NULL) { } virtual ~layer() = default; virtual std::string layer_type() const = 0; void print_layer_info(); //! 
debug print function @@ -81,6 +79,7 @@ class layer : public deepgalois::node { #else void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } #endif + void update_dim_size(size_t sg_size) { input_dims[0] = output_dims[0] = sg_size; } //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 0cd94adc05..910cae89b5 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -95,6 +95,7 @@ class Net { size_t train_begin, train_end, train_count; size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; + int val_interval; mask_t* train_masks; // masks for training mask_t* d_train_masks; // masks for training on device @@ -112,7 +113,8 @@ class Net { deepgalois::DistContext* context; #endif - void lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels); + void lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sub_labels); + void lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats); #ifdef CPU_ONLY // comparing outputs with the ground truth (labels) diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 900ff1de2e..676426c0c3 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -4,14 +4,14 @@ namespace deepgalois { class Sampler { public: - Sampler() : m(1000) {} + Sampler() : m_(1000) {} ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sample(size_t n, Graph &g, Graph &sg, VertexList &vertex_set, mask_t *masks); + void subgraph_sample(size_t n, Graph &sg, mask_t *masks); // !API function for user-defined selection strategy - virtual void select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m); + virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set); galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); @@ -19,16 +19,45 @@ class Sampler { Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } + void set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { + begin_ = begin; + end_ = end; + count_ = count; + masks_ = masks; + graph = g; + generate_masked_graph(count, masks, *g, masked_graph); + size_t idx = 0; + vertices_.resize(count); + for (size_t i = begin; i < end; i++) { + if (masks_[i] == 1) vertices_[idx++] = i; + } + } + protected: - size_t m; + int m_; + size_t count_; + size_t begin_; + size_t end_; + VertexList vertices_; + mask_t *masks_; + Graph masked_graph; + Graph *graph; + // Utility function to randomly select k items from [begin, end) - VertexList selectVertex(GNode begin, GNode end, size_t k); + template + T* select_k_items(T k, T begin, T end); + // Utility function to find ceiling of r in arr[l..h] - inline int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h); + template + inline T findCeil(std::vector arr, T r, T l, T h); + // Utility function to select one element from n elements given a frequency (probability) distribution - size_t selectOneVertex(size_t n, std::vector dist); + template + T select_one_item(T n, std::vector dist); + // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g void generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub); + void 
generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &mg); }; } diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index a2f6164439..9c4a8333d3 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -1,5 +1,6 @@ #ifndef _GNN_TYPES_H_ #define _GNN_TYPES_H_ +#include #include #include @@ -24,6 +25,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: typedef uint32_t VertexID; typedef uint64_t EdgeID; typedef std::vector VertexList; +typedef std::set VertexSet; typedef std::vector dims_t; // dimentions type #define CHUNK_SIZE 256 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 0fc3fe0a95..103aa94363 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,7 +1,6 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ -#include #include "deepgalois/context.h" namespace deepgalois { @@ -9,11 +8,11 @@ namespace deepgalois { #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - labels(NULL), h_feats(NULL), norm_factor(NULL), + h_labels(NULL), h_feats(NULL), norm_factor(NULL), d_labels(NULL), d_feats(NULL) {} Context::~Context() { - if (labels) delete labels; + if (h_labels) delete h_labels; if (h_feats) delete h_feats; if (norm_factor) delete norm_factor; } @@ -174,10 +173,10 @@ size_t Context::read_labels(std::string dataset_str) { assert(m == n); if (is_single_class) { std::cout << "Using single-class (one-hot) labels\n"; - labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + h_labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { std::cout << "Using multi-class labels\n"; - labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + h_labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E } unsigned v = 0; while (std::getline(in, line)) { @@ -187,11 +186,11 @@ size_t Context::read_labels(std::string dataset_str) { label_stream >> x; if (is_single_class) { if (x != 0) { - labels[v] = idx; + h_labels[v] = idx; break; } } else { - labels[v*num_classes+idx] = x; + h_labels[v*num_classes+idx] = x; } } v++; diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 86ad9003bf..7530bd3946 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -63,7 +63,7 @@ curandGenerator_t Context::curand_generator_ = 0; Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - labels(NULL), h_feats(NULL), norm_factor(NULL), + h_labels(NULL), h_feats(NULL), norm_factor(NULL), d_labels(NULL), d_feats(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); @@ -141,10 +141,10 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { void Context::copy_data_to_device() { if (is_single_class) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); } else { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * 
num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); @@ -153,7 +153,7 @@ void Context::copy_data_to_device() { //void Context::copy_data_to_device() { //float_malloc_device(n, d_labels); - //float_copy_device(n, labels, d_labels); + //float_copy_device(n, h_labels, d_labels); //float_malloc_device(n*feat_len, d_feats); //float_copy_device(n*feat_len, &h_feats[0], d_feats); //} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index d44c9b4632..ca875ebf12 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -21,6 +21,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, has_dense = dense; neighbor_sample_size = neigh_sz; subgraph_sample_size = subg_sz; + val_interval = 1; galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, @@ -107,7 +108,30 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } -void Net::lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels) { +// generate labels for the subgraph +void Net::lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sg_labels) { + size_t count = 0; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + if (is_single_class) { + sg_labels[count] = labels[i]; + } else { + std::copy(labels+i*num_classes, labels+(i+1)*num_classes, sg_labels+count*num_classes); + } + count ++; + } + } +} + +void Net::lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats) { + size_t count = 0; + size_t len = feature_dims[0]; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + std::copy(feats+i*len, feats+(i+1)*len, sg_feats+count*len); + count ++; + } + } } void Net::train(optimizer* opt, bool need_validate) { @@ -125,24 +149,37 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); double total_train_time = 0.0; + int num_subg_remain = 0; +#ifdef CPU_ONLY if (subgraph_sample_size) { subgraph_masks = new mask_t[num_samples]; - std::copy(train_masks, train_masks+num_samples, subgraph_masks); + sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } - +#endif Timer t_epoch; // run epochs - for (unsigned i = 0; i < num_epochs; i++) { - galois::gPrint(header, "Epoch ", std::setw(3), i, seperator); + for (unsigned ep = 0; ep < num_epochs; ep++) { + galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); t_epoch.Start(); if (subgraph_sample_size && num_subg_remain == 0) { #ifdef CPU_ONLY - VertexList vertices; - sampler->subgraph_sample(subgraph_sample_size, *(context->getGraphPointer()), - *(context->getSubgraphPointer()), vertices, subgraph_masks); + // generate subgraph + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer()), subgraph_masks); + for (size_t i = 0; i < num_conv_layers-1; i++) { + layers[i]->set_graph_ptr(context->getSubgraphPointer()); + } + // update masks for subgraph + layers[num_layers - 1]->set_sample_mask(train_begin, train_end, train_count, subgraph_masks); + + // update labels for subgraph lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); + 
layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); + + // update features for subgraph + lookup_feats(num_samples, subgraph_masks, context->get_feats_ptr(), context->get_feats_subg_ptr()); + layers[0]->set_in_data(context->get_feats_subg_ptr()); // feed input data #endif num_subg_remain += 1; // num_threads } @@ -175,7 +212,7 @@ void Net::train(optimizer* opt, bool need_validate) { t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; - if (need_validate) { + if (need_validate && ep % val_interval == 0) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); @@ -306,7 +343,18 @@ void Net::construct_layers() { if (has_dense) append_dense_layer(num_layers-2); // dense layer append_out_layer(num_layers-1); // output layer - layers[0]->set_in_data(context->get_in_ptr()); // feed input data + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + if (subgraph_sample_size) + layers[i]->update_dim_size(subgraph_sample_size); + layers[i]->add_edge(); + } + for (size_t i = 1; i < num_layers; i++) { + connect(layers[i - 1], layers[i]); + } + layers[0]->set_in_data(context->get_feats_ptr()); // feed input data + // precompute the normalization constant based on graph structure context->norm_factor_counting(); set_contexts(); } @@ -345,7 +393,6 @@ void Net::append_out_layer(size_t layer_id) { else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); - connect(layers[layer_id - 1], layers[layer_id]); } //! Add a convolution layer to the network @@ -360,7 +407,6 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); layers[layer_id]->set_graph_ptr(context->getGraphPointer()); - if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } void Net::read_test_masks(std::string dataset, Graph* dGraph) { diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 3b3ae84b85..dbdd984556 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -8,13 +8,14 @@ inline unsigned getDegree(Graph &g, GNode v) { namespace deepgalois { -// Utility function to randomly select k items from [begin, end) -VertexList Sampler::selectVertex(GNode begin, GNode end, size_t k) { +// Utility function to randomly select k vertices from [begin, end) +template +T* Sampler::select_k_items(T k, T begin, T end) { auto i = begin; // reservoir[] is the output array. Initialize // it with first k vertices - VertexList reservoir(k); + T *reservoir = new T[k]; for (; i < k; i++) reservoir[i] = i; // Use a different seed value so that we don't get @@ -35,8 +36,9 @@ VertexList Sampler::selectVertex(GNode begin, GNode end, size_t k) { } // Utility function to find ceiling of r in arr[l..h] -inline int Sampler::findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { - unsigned mid; +template +inline T Sampler::findCeil(std::vector arr, T r, T l, T h) { + T mid; while (l < h) { mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 (r > arr[mid]) ? 
(l = mid + 1) : (h = mid); @@ -46,16 +48,16 @@ inline int Sampler::findCeil(std::vector arr, unsigned r, unsigned l, // Utility function to select one element from n elements given a frequency (probability) distribution // https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ -size_t Sampler::selectOneVertex(size_t n, std::vector dist) { - std::vector offsets(n); +template +T Sampler::select_one_item(T n, std::vector dist) { + std::vector offsets(n); offsets[0] = dist[0]; // compute the prefix sum of the distribution - for (size_t i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; + for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; // offsets[n-1] is sum of all frequencies - unsigned sum = offsets[n-1]; - unsigned r = (rand() % sum) + 1; - // find which range r falls into, - // and return the index of the range + T sum = offsets[n-1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range return findCeil(offsets, r, 0, n - 1); } @@ -83,41 +85,73 @@ void Sampler::generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub) { } } +void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { + std::vector degrees(n, 0); + galois::do_all(galois::iterate(g), [&](const GNode src) { + if (masks[src] == 1) { + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) degrees[src] ++; + } + } + }, galois::loopname("update_degrees")); + std::vector offsets(n+1); + offsets[0] = 0; + for (size_t i = 0; i < n; i ++) { + offsets[i+1] = offsets[i] + degrees[i]; + } + size_t ne = offsets[n]; + sub.allocateFrom(n, ne); + sub.constructNodes(); + galois::do_all(galois::iterate(sub), [&](const GNode src) { + g.fixEndEdge(src, offsets[src+1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) g.constructEdge(idx++, dst, 0); + } + } + }, galois::loopname("gen_subgraph")); +} + // !API function for user-defined selection strategy -// Select n vertices from graph g and put them in vertex_set. +// Select n vertices from vertices and put them in vertex_set. +// nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
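// A minimal standalone sketch of the weighted-selection idea behind select_one_item/findCeil
// above: build an inclusive prefix sum over a frequency distribution, draw r uniformly from
// [1, total], then ceiling-binary-search for the first prefix entry >= r. The helper name
// pick_weighted and the plain std::vector input are assumptions for illustration only, not
// part of the Sampler API.
#include <cstdlib>
#include <vector>
template <typename T>
T pick_weighted(const std::vector<T>& freq) {
  std::vector<T> prefix(freq.size());
  prefix[0] = freq[0];
  for (std::size_t i = 1; i < freq.size(); ++i)
    prefix[i] = prefix[i - 1] + freq[i];     // inclusive prefix sum; prefix.back() == total
  T r = (std::rand() % prefix.back()) + 1;   // r falls in [1, total]
  T l = 0, h = (T)freq.size() - 1;
  while (l < h) {                            // find the ceiling of r in prefix[l..h]
    T mid = l + ((h - l) >> 1);
    (r > prefix[mid]) ? (l = mid + 1) : (h = mid);
  }
  return l;                                  // index i is returned with probability freq[i]/total
}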
-void Sampler::select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m) { - assert(n == vertex_set.size()); - auto num_vertices = g.size(); // number of vertices in the original input graph - auto frontier = selectVertex(0, num_vertices, m); // randomly select m vertices from g as frontier - for (size_t i = 0; i < m; i++) vertex_set[i] = frontier[i]; - std::vector degrees(m); - //std::vector probabilities(m); - //unsigned sum_degree = 0; - for (size_t i = 0; i < m; i++) { - degrees[i] = getDegree(g, frontier[i]); - //sum_degree += degrees[i]; - } - for (size_t i = 0; i < n - m; i++) { - //for (size_t i = 0; i < m; i++) - // probabilities[i] = (float)degrees[i] / (float)sum_degree; - auto pos = selectOneVertex(m, degrees); - GNode u = frontier[pos]; - auto degree = degrees[pos]; - auto neighbor_id = rand() % degree; - frontier[pos] = g.getEdgeDst(g.edge_begin(u) + neighbor_id); - degrees[pos] = getDegree(g, frontier[pos]); - //sum_degree -= degree; - //sum_degree += degrees[pos]; - vertex_set.push_back(u); - } +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set) { + assert(nv == vertices.size()); + assert(n == vertex_set.size()); + auto frontier_indices = select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier + VertexList frontier(m); + for (int i = 0; i < m; i++) vertex_set[i] = frontier[i] = vertices[frontier_indices[i]]; + std::vector degrees(m); + galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto i) { + degrees[i] = getDegree(g, frontier[i]); + }, galois::loopname("compute_degrees")); + for (size_t i = 0; i < n - m; i++) { + auto pos = select_one_item((int)m, degrees); + auto u = frontier[pos]; + auto degree = degrees[pos]; + auto neighbor_id = rand() % degree; // randomly select a neighbor + auto dst = g.getEdgeDst(g.edge_begin(u) + neighbor_id); + frontier[pos] = dst; + degrees[pos] = getDegree(g, frontier[pos]); + vertex_set.push_back(u); + } +} + +void update_masks(size_t n, VertexList vertices, mask_t *masks) { + std::fill(masks, masks+n, 0); + for (auto v : vertices) masks[v] = 1; } -void Sampler::subgraph_sample(size_t n, Graph &g, Graph&sg, VertexList &vertex_set, mask_t *masks) { - vertex_set.resize(n); - select_vertices(g, vertex_set, n, m); - generate_subgraph(vertex_set, g, sg); +void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { + VertexList vertex_set(n); + select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); + generate_subgraph(vertex_set, masked_graph, sg); + update_masks(graph->size(), vertex_set, masks); } } // end namespace diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index cdfda9eba0..324f5a31ba 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -41,6 +41,7 @@ static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); static cll::opt add_dense("d", cll::desc("add an dense layer"), cll::init(0)); +static cll::opt val_interval("vi", cll::desc("validation interval (default value 1)"), cll::init(1)); static cll::opt neighbor_sample_sz("ns", cll::desc("neighbor sampling size (default value 0)"), cll::init(0)); static cll::opt subgraph_sample_sz("ss", cll::desc("subgraph sampling size (default value 0)"), cll::init(0)); From 87c742db977096f8038c77632a11b057ba597dba Mon Sep 17 
00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 11:18:12 -0500 Subject: [PATCH 194/660] update utils --- .../include/deepgalois/DistContext.h | 8 +- libdeepgalois/include/deepgalois/configs.h | 11 ++ libdeepgalois/include/deepgalois/context.h | 4 +- .../deepgalois/layers/graph_conv_layer.h | 24 +-- .../include/deepgalois/layers/layer.h | 3 +- .../include/deepgalois/math_functions.hh | 1 - libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/include/deepgalois/sampler.h | 20 +-- libdeepgalois/include/deepgalois/types.h | 2 + libdeepgalois/include/deepgalois/utils.h | 83 ++++++++-- libdeepgalois/src/DistContext.cpp | 46 ++++++ libdeepgalois/src/context.cpp | 51 +++++- libdeepgalois/src/context.cu | 10 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 22 ++- libdeepgalois/src/layers/graph_conv_layer.cu | 2 +- libdeepgalois/src/math_functions.cpp | 1 + libdeepgalois/src/net.cpp | 13 +- libdeepgalois/src/net.cu | 3 +- libdeepgalois/src/sampler.cpp | 152 +++++++----------- libdeepgalois/src/utils.cpp | 127 +++++---------- 20 files changed, 333 insertions(+), 252 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/configs.h diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 704247d54b..3054915ded 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -5,7 +5,6 @@ */ #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" -#include "deepgalois/utils.h" #include "deepgalois/gtypes.h" namespace deepgalois { @@ -28,10 +27,17 @@ class DistContext { //! save graph pointer to context object void saveGraph(Graph* dGraph); + //! read labels of local nodes only size_t read_labels(std::string dataset_str); + //! read features of local nodes only size_t read_features(std::string dataset_str); + + //! read masks of local nodes only + size_t read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); + //! 
find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_counting(); diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h new file mode 100644 index 0000000000..3de67ecb74 --- /dev/null +++ b/libdeepgalois/include/deepgalois/configs.h @@ -0,0 +1,11 @@ +#pragma once + +namespace deepgalois { + +const std::string path = + "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset + +#define NUM_DATASETS 8 +const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; + +} diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 52a306e90d..d995a41c8c 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,7 +6,6 @@ #include #include #include "deepgalois/types.h" -#include "deepgalois/utils.h" #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -15,6 +14,7 @@ #endif namespace deepgalois { + class Context { public: Context(); @@ -25,6 +25,8 @@ class Context { size_t read_graph_gpu(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str, std::string filetype = "bin"); + size_t read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, mask_t* masks); label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index eb42fe1093..8a6992e30c 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -35,7 +35,7 @@ class graph_conv_layer : public layer { ~graph_conv_layer() {} void init(); std::string layer_type() const override { return std::string("graph_conv"); } - void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } + void set_netphase(net_phase ctx) override { phase_ = ctx; } void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } virtual acc_t get_weight_decay_loss(); //! 
Uses weights contained in this layer to update in_data (results from previous) @@ -64,7 +64,7 @@ class graph_conv_layer : public layer { bool dropout_; // whether to use dropout at first const float_t dropout_rate_; float_t scale_; - deepgalois::net_phase phase_; + net_phase phase_; size_t x; size_t y; size_t z; @@ -76,22 +76,8 @@ class graph_conv_layer : public layer { float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1) { - auto init_range = sqrt(6.0 / (dim_x + dim_y)); - std::default_random_engine rng(seed); - std::uniform_real_distribution dist(-init_range, init_range); - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i * dim_y + j] = dist(rng); - } - } - inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i * dim_y + j] = 0; - } - } + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix); }; + } // namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a359467ad8..d0bfac6e16 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -10,7 +10,6 @@ */ #include "deepgalois/types.h" -#include "deepgalois/utils.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else @@ -69,7 +68,7 @@ class layer : public deepgalois::node { float_t* get_grads_device_ptr() { return d_weight_grad; } // set methods - virtual void set_netphase(deepgalois::net_phase phase) {} + virtual void set_netphase(net_phase phase) {} virtual void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 5611caaa94..9c0e58dc45 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -9,7 +9,6 @@ #include #include #include "deepgalois/types.h" -#include "deepgalois/utils.h" extern "C" { #include diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 910cae89b5..d0adf2d55f 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -68,7 +68,7 @@ class Net { layers[i]->set_context(context); } //! 
set netphases for all layers in this network - void set_netphases(deepgalois::net_phase phase) { + void set_netphases(net_phase phase) { for (size_t i = 0; i < num_layers; i++) layers[i]->set_netphase(phase); } diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 676426c0c3..9f57ed53da 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -11,7 +11,7 @@ class Sampler { void subgraph_sample(size_t n, Graph &sg, mask_t *masks); // !API function for user-defined selection strategy - virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set); + virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set); galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); @@ -43,21 +43,13 @@ class Sampler { Graph masked_graph; Graph *graph; - // Utility function to randomly select k items from [begin, end) - template - T* select_k_items(T k, T begin, T end); - - // Utility function to find ceiling of r in arr[l..h] - template - inline T findCeil(std::vector arr, T r, T l, T h); - - // Utility function to select one element from n elements given a frequency (probability) distribution - template - T select_one_item(T n, std::vector dist); - // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g - void generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub); + void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &mg); + + void get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees); + void update_masks(size_t n, VertexSet vertices, mask_t *masks); + inline VertexList reindexing_vertice(VertexSet vertex_set); }; } diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 9c4a8333d3..9c6c79c6e5 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -28,6 +28,8 @@ typedef std::vector VertexList; typedef std::set VertexSet; typedef std::vector dims_t; // dimentions type +enum class net_phase { train, test }; + #define CHUNK_SIZE 256 #define TB_SIZE 256 #define BLOCK_SIZE 256 diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 097457290d..60974b9f8a 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -15,11 +15,6 @@ namespace deepgalois { -const std::string path = - "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset - -enum class net_phase { train, test }; - //! tracks max mem usage with rusage // TODO use Galois's getrusage functionality class ResourceManager { @@ -105,14 +100,76 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) > p; } +// sequential prefix sum +template +inline std::vector prefix_sum(const std::vector &in) { + std::vector prefix(in.size() + 1); + OutTy total = 0; + for (size_t i = 0; i < in.size(); i ++) { + prefix[i] = total; + total += (OutTy)in[i]; + } + prefix[in.size()] = total; + return prefix; +} + +template +OutTy* parallel_prefix_sum(const std::vector &in); + +// Utility function to randomly select k items from [begin, end) +template +inline T* select_k_items(T k, T begin, T end) { + auto i = begin; + + // reservoir[] is the output array. 
Initialize + // it with first k vertices + T *reservoir = new T[k]; + for (; i < k; i++) reservoir[i] = i; + + // Use a different seed value so that we don't get + // same result each time we run this program + srand(time(NULL)); + + // Iterate from the (k+1)th element to nth element + for (; i < end; i++) { + // Pick a random index from 0 to i. + auto j = rand() % (i + 1); + + // If the randomly picked index is smaller than k, + // then replace the element present at the index + // with new element from stream + if (j < k) reservoir[j] = i; + } + return reservoir; +} + +// Utility function to find ceiling of r in arr[l..h] +template +inline T find_ceil(T *arr, T r, T l, T h) { + T mid; + while (l < h) { + mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 + (r > arr[mid]) ? (l = mid + 1) : (h = mid); + } + return (arr[l] >= r) ? l : -1; +} + +// Utility function to select one element from n elements given a frequency (probability) distribution +// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ +template +T select_one_item(T n, T *dist) { + T *offsets = new T[n]; + offsets[0] = dist[0]; + // compute the prefix sum of the distribution + for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; + // offsets[n-1] is sum of all frequencies + T sum = offsets[n-1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range + return find_ceil(offsets, r, 0, n - 1); +} + acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred); -#ifdef GALOIS_USE_DIST -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); -#else -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks); -#endif -} +} // end namespace diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 9069fad351..7c4fd00a46 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -1,4 +1,6 @@ #include "deepgalois/DistContext.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" namespace deepgalois { DistContext::DistContext() {} @@ -101,6 +103,50 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, + mask_t* masks, Graph* dGraph) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "(" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; + in.close(); + return 
sample_count; +} + float_t* DistContext::get_in_ptr() { return &h_feats[0]; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 103aa94363..38ee7543c0 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -2,14 +2,19 @@ * Based on common.hpp file of the Caffe deep learning library. */ #include "deepgalois/context.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" namespace deepgalois { #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_feats(NULL), norm_factor(NULL), - d_labels(NULL), d_feats(NULL) {} + h_labels(NULL), h_labels_subg(NULL), + h_feats(NULL), h_feats_subg(NULL), + d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), + norm_factor(NULL) {} Context::~Context() { if (h_labels) delete h_labels; @@ -253,6 +258,48 @@ size_t Context::read_features(std::string dataset_str, std::string filetype) { return feat_len; } +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t Context::read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, mask_t* masks) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; + in.close(); + return sample_count; +} + /* inline void init_features(size_t dim, vec_t &x) { std::default_random_engine rng; diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 7530bd3946..bdef92b52a 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -4,6 +4,7 @@ #include #include "deepgalois/context.h" #include "deepgalois/math_functions.hh" +#include "deepgalois/configs.h" // random seeding int64_t cluster_seedgen(void) { @@ -62,9 +63,12 @@ cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; Context::Context() : n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_feats(NULL), norm_factor(NULL), - d_labels(NULL), d_feats(NULL) { + is_single_class(true), is_selfloop_added(false), + h_labels(NULL), h_labels_subg(NULL), + h_feats(NULL), h_feats_subg(NULL), + d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), + norm_factor(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3c63468159..7616bfa6c6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ 
b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/utils.h" namespace deepgalois { @@ -19,6 +20,25 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, scale_ = 1. / (1. - dropout_rate_); } +inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { + auto init_range = sqrt(6.0 / (dim_x + dim_y)); + std::default_random_engine rng(seed); + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = dist(rng); + } +} + +inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = 0; + } +} + #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { deepgalois::update_all(len, g, in, out, norm_, norm_factor); @@ -64,7 +84,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == deepgalois::net_phase::train) { + if (dropout_ && phase_ == net_phase::train) { math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); } else math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index ac29b73a7b..3702a0d709 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -53,7 +53,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ exit(0); } init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == deepgalois::net_phase::train) + if (dropout_ && phase_ == net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); else copy_gpu(x*y, in_data, in_temp); if (y > z) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 0cc7812e9e..555eb7bfca 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -2,6 +2,7 @@ #include "galois/Timer.h" #include "galois/Galois.h" #include +#include "deepgalois/utils.h" extern "C" { #include diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index ca875ebf12..4d73752436 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -3,6 +3,7 @@ */ #include "deepgalois/net.h" +#include "deepgalois/utils.h" namespace deepgalois { @@ -74,11 +75,11 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } else { #ifndef GALOIS_USE_DIST - train_count = read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); - val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); + train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); + val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); #else - train_count = 
read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); + train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); #endif } @@ -426,9 +427,9 @@ void Net::read_test_masks(std::string dataset, Graph* dGraph) { #endif } else { #ifndef GALOIS_USE_DIST - test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); + test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); #else - test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); + test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); #endif } #ifndef CPU_ONLY diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 27c3ea5de8..900ba1a762 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -2,6 +2,7 @@ #include "deepgalois/cutils.h" #include "gg.h" #include "ggcuda.h" +#include // the arguments of the maxima __device__ int argmax_device(const int n, const float_t* x) { @@ -109,7 +110,7 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, acc_t fn = (acc_t)h_fn[i]; // false negtive acc_t fp = (acc_t)h_fp[i]; // false positive acc_t tp = (acc_t)h_tp[i]; // true positive - acc_t tn = (acc_t)h_tn[i]; // true positive + //acc_t tn = (acc_t)h_tn[i]; // true positive precisionMacro = precisionMacro + (tp / (tp + fp)); recallMacro = recallMacro + (tp / (tp + fn)); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index dbdd984556..a86fa110c2 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -1,3 +1,4 @@ +#include "deepgalois/utils.h" #include "deepgalois/sampler.h" #include #include @@ -8,85 +9,8 @@ inline unsigned getDegree(Graph &g, GNode v) { namespace deepgalois { -// Utility function to randomly select k vertices from [begin, end) -template -T* Sampler::select_k_items(T k, T begin, T end) { - auto i = begin; - - // reservoir[] is the output array. Initialize - // it with first k vertices - T *reservoir = new T[k]; - for (; i < k; i++) reservoir[i] = i; - - // Use a different seed value so that we don't get - // same result each time we run this program - srand(time(NULL)); - - // Iterate from the (k+1)th element to nth element - for (; i < end; i++) { - // Pick a random index from 0 to i. - auto j = rand() % (i + 1); - - // If the randomly picked index is smaller than k, - // then replace the element present at the index - // with new element from stream - if (j < k) reservoir[j] = i; - } - return reservoir; -} - -// Utility function to find ceiling of r in arr[l..h] -template -inline T Sampler::findCeil(std::vector arr, T r, T l, T h) { - T mid; - while (l < h) { - mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 - (r > arr[mid]) ? (l = mid + 1) : (h = mid); - } - return (arr[l] >= r) ? 
l : -1; -} - -// Utility function to select one element from n elements given a frequency (probability) distribution -// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ -template -T Sampler::select_one_item(T n, std::vector dist) { - std::vector offsets(n); - offsets[0] = dist[0]; - // compute the prefix sum of the distribution - for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; - // offsets[n-1] is sum of all frequencies - T sum = offsets[n-1]; - T r = (rand() % sum) + 1; - // find which range r falls into, and return the index of the range - return findCeil(offsets, r, 0, n - 1); -} - -// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g -void Sampler::generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub) { - auto nv = vertex_set.size(); - size_t ne = 0; - std::vector offsets(nv+1); - offsets[0] = 0; - size_t i = 0; - VertexList vertices(nv); - for (auto v : vertex_set) { - vertices[i] = v; - offsets[i+1] = offsets[i] + getDegree(g, v); - i++; - } - // TODO: need to remove edges whose has endpoint not belong to the selected vertex subset - sub.allocateFrom(nv, ne); - sub.constructNodes(); - for (i = 0; i < nv; i++) { - g.fixEndEdge(i, offsets[i+1]); - for (unsigned offset = 0; offset < offsets[i+1]-offsets[i]; offset ++) { - g.constructEdge(offsets[i]+offset, g.getEdgeDst(g.edge_begin(vertices[i])+offset), 0); - } - } -} - -void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { - std::vector degrees(n, 0); +void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees) { + assert(degrees.size() == n); galois::do_all(galois::iterate(g), [&](const GNode src) { if (masks[src] == 1) { for (const auto e : g.edges(src)) { @@ -95,11 +19,12 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &su } } }, galois::loopname("update_degrees")); - std::vector offsets(n+1); - offsets[0] = 0; - for (size_t i = 0; i < n; i ++) { - offsets[i+1] = offsets[i] + degrees[i]; - } +} + +void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { + std::vector degrees(n, 0); + get_masked_degrees(n, masks, g, degrees); + auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; sub.allocateFrom(n, ne); sub.constructNodes(); @@ -120,15 +45,16 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &su // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
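// A compact sketch of the frontier-based selection strategy documented above, written against
// a plain CSR (row_ptr/col_idx) rather than the Galois Graph type; sample_frontier and its
// uniform stand-in for the degree-weighted frontier pick are assumptions for illustration,
// not the Sampler implementation.
#include <cstdlib>
#include <set>
#include <vector>
std::set<int> sample_frontier(const std::vector<int>& row_ptr, const std::vector<int>& col_idx,
                              int n, int m) {   // n: target subgraph size, m: frontier size
  int nv = (int)row_ptr.size() - 1;
  std::vector<int> frontier(m), deg(m);
  std::set<int> picked;
  for (int i = 0; i < m; ++i) {                 // seed the frontier with m random vertices
    frontier[i] = std::rand() % nv;
    deg[i] = row_ptr[frontier[i] + 1] - row_ptr[frontier[i]];
    picked.insert(frontier[i]);
  }
  for (int i = 0; i < n - m; ++i) {
    int pos = std::rand() % m;                  // uniform stand-in for the degree-weighted pick
    int u = frontier[pos];
    if (deg[pos] == 0) continue;                // skip vertices without outgoing edges
    int nbr = col_idx[row_ptr[u] + std::rand() % deg[pos]]; // hop to a random neighbor of u
    picked.insert(u);
    frontier[pos] = nbr;                        // the neighbor replaces u in the frontier
    deg[pos] = row_ptr[nbr + 1] - row_ptr[nbr];
  }
  return picked;                                // vertex set of size at most n (duplicates collapse)
}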
-void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set) { +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set) { assert(nv == vertices.size()); - assert(n == vertex_set.size()); - auto frontier_indices = select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); - for (int i = 0; i < m; i++) vertex_set[i] = frontier[i] = vertices[frontier_indices[i]]; - std::vector degrees(m); + for (int i = 0; i < m; i++) + frontier[i] = vertices[frontier_indices[i]]; + vertex_set.insert(frontier.begin(), frontier.end()); + int *degrees = new int[m]; galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto i) { - degrees[i] = getDegree(g, frontier[i]); + degrees[i] = (int)getDegree(g, frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); @@ -138,20 +64,56 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList v auto dst = g.getEdgeDst(g.edge_begin(u) + neighbor_id); frontier[pos] = dst; degrees[pos] = getDegree(g, frontier[pos]); - vertex_set.push_back(u); + vertex_set.insert(u); } + assert(n == vertex_set.size()); } -void update_masks(size_t n, VertexList vertices, mask_t *masks) { +void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { std::fill(masks, masks+n, 0); for (auto v : vertices) masks[v] = 1; } +inline VertexList Sampler::reindexing_vertice(VertexSet vertex_set) { + VertexList new_ids(vertex_set.size(), 0); + int vid = 0; + for (auto v : vertex_set) { + new_ids[v] = vid++; // reindex + } + return new_ids; +} + +// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g +void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { + //auto n = g.size(); // old graph size + auto nv = vertex_set.size(); // new graph (subgraph) size + VertexList new_ids = reindexing_vertice(vertex_set); + std::vector degrees(nv, 0); // degrees of vertices in the subgraph + for (auto v : vertex_set) { + degrees[new_ids[v]] = std::distance(g.edge_begin(v), g.edge_end(v)); + } + auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto ne = offsets[nv]; + sub.allocateFrom(nv, ne); + sub.constructNodes(); + VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping + galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + g.fixEndEdge(i, offsets[i+1]); + unsigned j = 0; + auto old_id = old_ids[i]; + for (auto e : g.edges(old_id)) { + g.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); + j ++; + } + }, galois::loopname("compute_degrees")); +} + void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { - VertexList vertex_set(n); - select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); + VertexSet vertex_set; // n = 9000 by default + select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set + generate_masked_graph(n, masks, masked_graph, sg); // remove edges whose destination is not masked generate_subgraph(vertex_set, masked_graph, sg); - update_masks(graph->size(), vertex_set, masks); } } // end namespace diff --git a/libdeepgalois/src/utils.cpp 
b/libdeepgalois/src/utils.cpp index b2b65c9582..dedb9c225a 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -3,8 +3,41 @@ namespace deepgalois { -#define NUM_DATASETS 8 -const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; +// parallel prefix sum +template +OutTy* parallel_prefix_sum(const std::vector &in) { + const size_t block_size = 1<<20; + const size_t num_blocks = (in.size() + block_size - 1) / block_size; + std::vector local_sums(num_blocks); + // count how many bits are set on each thread + galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy lsum = 0; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i=block * block_size; i < block_end; i++) + lsum += in[i]; + local_sums[block] = lsum; + }); + std::vector bulk_prefix(num_blocks+1); + OutTy total = 0; + for (size_t block=0; block < num_blocks; block++) { + bulk_prefix[block] = total; + total += local_sums[block]; + } + bulk_prefix[num_blocks] = total; + OutTy *prefix = new OutTy[in.size() + 1]; + galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy local_total = bulk_prefix[block]; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i=block * block_size; i < block_end; i++) { + prefix[i] = local_total; + local_total += in[i]; + } + }); + prefix[in.size()] = bulk_prefix[num_blocks]; + return prefix; +} + +template uint32_t* parallel_prefix_sum(const std::vector &in); // Compute the F1 score, also known as balanced F-score or F-measure // The F1 score can be interpreted as a weighted average of the precision and recall, @@ -62,92 +95,4 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, return f1_micro; } -#ifndef GALOIS_USE_DIST -//! Get masks from datafile where first line tells range of -//! 
set to create mask from -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks) { - bool dataset_found = false; - for (int i = 0; i < NUM_DATASETS; i++) { - if (dataset_str == dataset_names[i]) { - dataset_found = true; - break; - } - } - if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - // std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count++; - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; - in.close(); - return sample_count; -} -#else -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, - mask_t* masks, Graph* dGraph) { - bool dataset_found = false; - for (int i = 0; i < NUM_DATASETS; i++) { - if (dataset_str == dataset_names[i]) { - dataset_found = true; - break; - } - } - if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - // only bother if it's local - if (dGraph->isLocal(i)) { - masks[dGraph->getLID(i)] = 1; - sample_count++; - } - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "(" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; - in.close(); - return sample_count; -} -#endif - -} +} // end namespace From 5202c7adffc275670e988907320c3242f5bc7c95 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 12:47:49 -0500 Subject: [PATCH 195/660] fix dist --- .../include/deepgalois/DistContext.h | 34 ++++++++++++------- libdeepgalois/include/deepgalois/sampler.h | 4 +-- libdeepgalois/src/DistContext.cpp | 6 ++-- libdeepgalois/src/sampler.cpp | 25 ++++++++------ lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/include/DistributedGraphLoader.h | 2 +- 6 files changed, 44 insertions(+), 29 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 3054915ded..c0ee3ec704 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,18 +10,25 @@ namespace deepgalois { class DistContext { +protected: size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - std::vector labels; // labels for classification: N x 1 - vec_t h_feats; // input features: N x D galois::graphs::GluonSubstrate* syncSubstrate; -public: - // TODO why are these public - float_t* norm_factor; // normalization constant based on graph structure - Graph* graph_cpu; // the 
input graph, |V| = N + Graph* graph_cpu; // the input graph, |V| = N + Graph* subgraph_cpu; + label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE + label_t *h_labels_subg; // labels for subgraph + float_t* h_feats; // input features: N x D + float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t *d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factor; // normalization constant based on graph structure +public: DistContext(); ~DistContext(); @@ -42,18 +49,21 @@ class DistContext { // TODO this is a distributed operation void norm_factor_counting(); + float_t* get_norm_factor_ptr() { return norm_factor; } + Graph* getGraphPointer() { return graph_cpu; } + Graph* getSubgraphPointer() { return subgraph_cpu; }; + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg; } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg; } + void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - Graph* getGraphPointer() { - return graph_cpu; - } //! return label for some node //! NOTE: this is LID, not GID - label_t get_label(size_t i) { - return labels[i]; - } + label_t get_label(size_t i) { return h_labels[i]; } //! returns pointer to the features of each local node float_t* get_in_ptr(); diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 9f57ed53da..14342c1c6d 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -40,12 +40,12 @@ class Sampler { size_t end_; VertexList vertices_; mask_t *masks_; - Graph masked_graph; + Graph *masked_graph; Graph *graph; // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); - void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &mg); + void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *mg); void get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees); void update_masks(size_t n, VertexSet vertices, mask_t *masks); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7c4fd00a46..2a9ad81575 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -26,7 +26,7 @@ size_t DistContext::read_labels(std::string dataset_str) { in >> m >> num_classes >> std::ws; assert(m == dGraph->globalSize()); // size of labels should be # local nodes - labels.resize(dGraph->size(), 0); + h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for each vertex: N x 1 uint32_t foundVertices = 0; unsigned v = 0; @@ -42,7 +42,7 @@ size_t DistContext::read_labels(std::string dataset_str) { label_stream >> x; if (x != 0) { // set local id - labels[dGraph->getLID(v)] = idx; + h_labels[dGraph->getLID(v)] = idx; foundVertices++; break; } @@ -76,7 +76,7 @@ size_t DistContext::read_features(std::string dataset_str) { // header read in >> m >> feat_len >> std::ws; // use local size, not global size - h_feats.resize(dGraph->size() * feat_len, 0); + h_feats = new float_t[dGraph->size() * feat_len]; // loop through all features while (std::getline(in, line)) { diff --git a/libdeepgalois/src/sampler.cpp 
b/libdeepgalois/src/sampler.cpp index a86fa110c2..6ee47a452e 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -11,7 +11,7 @@ namespace deepgalois { void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(g), [&](const GNode src) { + galois::do_all(galois::iterate(size_t(0), n), [&](const GNode src) { if (masks[src] == 1) { for (const auto e : g.edges(src)) { const auto dst = g.getEdgeDst(e); @@ -21,14 +21,16 @@ void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector< }, galois::loopname("update_degrees")); } -void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { +void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; - sub.allocateFrom(n, ne); - sub.constructNodes(); - galois::do_all(galois::iterate(sub), [&](const GNode src) { +#ifndef GALOIS_USE_DIST + sub = new Graph(); + sub->allocateFrom(n, ne); + sub->constructNodes(); + galois::do_all(galois::iterate((size_t)0, n), [&](const GNode src) { g.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; @@ -38,6 +40,7 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &su } } }, galois::loopname("gen_subgraph")); +#endif } // !API function for user-defined selection strategy @@ -53,7 +56,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList v frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); int *degrees = new int[m]; - galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto i) { + galois::do_all(galois::iterate(size_t(0), g.size()), [&](const auto i) { degrees[i] = (int)getDegree(g, frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { @@ -94,6 +97,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; +#ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping @@ -105,15 +109,16 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { g.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); j ++; } - }, galois::loopname("compute_degrees")); + }, galois::loopname("construct_graph")); +#endif } void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { VertexSet vertex_set; // n = 9000 by default - select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + select_vertices(count_, n, m_, *masked_graph, vertices_, vertex_set); // m = 1000 by default update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set - generate_masked_graph(n, masks, masked_graph, sg); // remove edges whose destination is not masked - generate_subgraph(vertex_set, masked_graph, sg); + generate_masked_graph(n, masks, *masked_graph, &sg); // remove edges whose destination is not masked + generate_subgraph(vertex_set, *masked_graph, sg); } } // end namespace diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index ba9cbe3529..de999a095e 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -21,7 +21,7 @@ int main(int 
argc, char** argv) { Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; - Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif // read network, features, ground truth, initialize metadata diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h index b7da4faa54..247ad0763c 100644 --- a/lonestargnn/include/DistributedGraphLoader.h +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -31,7 +31,7 @@ #define D_GRAPH_LOADER_SYM #include "galois/graphs/CuSPPartitioner.h" -#include "deepgalois/utils.h" +#include "deepgalois/configs.h" /******************************************************************************* * Supported partitioning schemes From b183f2b1d05868e492ee10400471be85c3f71e6a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 15:12:34 -0500 Subject: [PATCH 196/660] fix some bugs --- .../include/deepgalois/DistContext.h | 1 + libdeepgalois/include/deepgalois/context.h | 1 + .../deepgalois/layers/graph_conv_layer.h | 5 +- .../include/deepgalois/layers/layer.h | 1 + .../deepgalois/layers/sigmoid_loss_layer.h | 5 +- .../deepgalois/layers/softmax_loss_layer.h | 1 + libdeepgalois/include/deepgalois/sampler.h | 27 ++---- libdeepgalois/src/context.cpp | 4 + libdeepgalois/src/layers/graph_conv_layer.cpp | 17 ++-- libdeepgalois/src/layers/graph_conv_layer.cu | 16 +++- .../src/layers/sigmoid_loss_layer.cpp | 5 +- .../src/layers/sigmoid_loss_layer.cu | 5 +- .../src/layers/softmax_loss_layer.cpp | 5 +- .../src/layers/softmax_loss_layer.cu | 5 +- libdeepgalois/src/net.cpp | 14 ++- libdeepgalois/src/sampler.cpp | 93 +++++++++++++------ 16 files changed, 132 insertions(+), 73 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index c0ee3ec704..4444143f09 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -48,6 +48,7 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_counting(); + void createSubgraph() {} float_t* get_norm_factor_ptr() { return norm_factor; } Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index d995a41c8c..15b9605cec 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -39,6 +39,7 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N Graph* subgraph_cpu; + void createSubgraph(); void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 8a6992e30c..7f0aa5a9a3 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -33,7 +33,7 @@ class graph_conv_layer : public layer { std::vector out_dims) : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} - void init(); + void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } @@ -65,9 +65,6 @@ class graph_conv_layer : public layer { const float_t dropout_rate_; float_t scale_; net_phase phase_; - size_t x; - size_t y; - size_t z; float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* in_temp1; diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index d0bfac6e16..c604f6ffbe 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -55,6 +55,7 @@ class layer : public deepgalois::node { virtual ~layer() = default; virtual std::string layer_type() const = 0; void print_layer_info(); //! debug print function + virtual void malloc_and_init() {} // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 760b6f0ab1..c8b1241acc 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -7,9 +7,8 @@ class sigmoid_loss_layer : public layer { sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~sigmoid_loss_layer(); - std::string layer_type() const override { - return std::string("sigmoid_loss"); - } + std::string layer_type() const override { return std::string("sigmoid_loss"); } + void malloc_and_init(); inline label_t get_label(size_t i, size_t j); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 060698e3d9..43f07728cd 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -8,6 +8,7 @@ class softmax_loss_layer : public layer { std::vector out_dims); ~softmax_loss_layer(); std::string layer_type() const override { return std::string("softmax_loss"); } + void malloc_and_init(); inline label_t get_label(size_t i); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 14342c1c6d..01616d01f5 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -8,10 +8,10 @@ class Sampler { ~Sampler() {} // sample a subgraph sg of size n from graph g - void 
subgraph_sample(size_t n, Graph &sg, mask_t *masks); + void subgraph_sample(size_t n, Graph &sg, mask_t* masks); // !API function for user-defined selection strategy - virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set); + virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); @@ -19,19 +19,7 @@ class Sampler { Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } - void set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { - begin_ = begin; - end_ = end; - count_ = count; - masks_ = masks; - graph = g; - generate_masked_graph(count, masks, *g, masked_graph); - size_t idx = 0; - vertices_.resize(count); - for (size_t i = begin; i < end; i++) { - if (masks_[i] == 1) vertices_[idx++] = i; - } - } + void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, Graph* g); protected: int m_; @@ -45,11 +33,10 @@ class Sampler { // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); - void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *mg); - - void get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees); - void update_masks(size_t n, VertexSet vertices, mask_t *masks); - inline VertexList reindexing_vertice(VertexSet vertex_set); + void generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& mg); + void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector °rees); + void update_masks(size_t n, VertexSet vertices, mask_t* masks); + inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); }; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 38ee7543c0..f6d443f4f1 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -27,6 +27,10 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } +void Context::createSubgraph() { + subgraph_cpu = new Graph(); +} + size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 7616bfa6c6..9903768070 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,12 +10,8 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); - init(); assert(dropout_rate_ < 1.); scale_ = 1. / (1. 
- dropout_rate_); } @@ -52,7 +48,10 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } -void graph_conv_layer::init() { +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; #ifdef GALOIS_USE_DIST // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, @@ -81,6 +80,9 @@ void graph_conv_layer::init() { // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) @@ -106,6 +108,9 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying @@ -144,7 +149,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - return math::l2_norm(y*z, &layer::W[0]); + return math::l2_norm(input_dims[1]*output_dims[1], &layer::W[0]); } #endif // end if CPU_ONLY diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 3702a0d709..ed89089450 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -2,7 +2,11 @@ namespace deepgalois { -void graph_conv_layer::init() { +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(unsigned))); //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); float_malloc_device(x*y, in_temp); @@ -47,6 +51,10 @@ void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, // GPU forward: compute output features // NOTE: in_data will be used in back-prop, so it can not be modified void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (z > MAX_NUM_CLASSES) { std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; // currently only support feature length <= 128 @@ -70,6 +78,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); if (y > z) { graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); @@ -88,7 +100,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - return l2_norm_gpu(y*z, d_W); + return 
l2_norm_gpu(input_dims[1]*output_dims[1], d_W); } } // namespace diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index a5ec7eef49..19606eec6c 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -9,13 +9,16 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - loss = new float_t[in_dims[0]]; // error for each sample } sigmoid_loss_layer::~sigmoid_loss_layer() { delete loss; } +void sigmoid_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { //return context->get_label(i, j); return labels[i*input_dims[1]+j]; diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 1fcc55e207..4159569601 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -10,13 +10,16 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - float_malloc_device(in_dims[0], loss); } sigmoid_loss_layer::~sigmoid_loss_layer() { float_free_device(loss); } +void sigmoid_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 2f944656de..9e4fda933e 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,13 +9,16 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - loss = new float_t[in_dims[0]]; // error for each sample } softmax_loss_layer::~softmax_loss_layer() { delete loss; } +void softmax_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + inline label_t softmax_loss_layer::get_label(size_t i) { return labels[i]; //return context->get_label(i); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 3eb5065edd..fd3fc11140 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -10,13 +10,16 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - float_malloc_device(in_dims[0], loss); } softmax_loss_layer::~softmax_loss_layer() { float_free_device(loss); } +void softmax_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 4d73752436..e9ed3b4fd4 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -111,6 +111,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, // generate labels for the subgraph void Net::lookup_labels(size_t n, const mask_t *masks, 
const label_t *labels, label_t *sg_labels) { + if (sg_labels == NULL) sg_labels = new label_t[subgraph_sample_size]; size_t count = 0; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { @@ -127,6 +128,7 @@ void Net::lookup_labels(size_t n, const mask_t *masks, const label_t *labels, la void Net::lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats) { size_t count = 0; size_t len = feature_dims[0]; + if (sg_feats == NULL) sg_feats = new float_t[subgraph_sample_size*len]; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { std::copy(feats+i*len, feats+(i+1)*len, sg_feats+count*len); @@ -144,7 +146,6 @@ void Net::train(optimizer* opt, bool need_validate) { seperator = "\n"; #endif - galois::gPrint("\nStart training...\n"); galois::StatTimer Tupdate("Train-WeightUpdate"); galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); @@ -154,10 +155,12 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; #ifdef CPU_ONLY if (subgraph_sample_size) { + galois::gPrint("\nConstruct training vertex set induced graph...\n"); subgraph_masks = new mask_t[num_samples]; sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } #endif + galois::gPrint("\nStart training...\n"); Timer t_epoch; // run epochs for (unsigned ep = 0; ep < num_epochs; ep++) { @@ -167,7 +170,9 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size && num_subg_remain == 0) { #ifdef CPU_ONLY // generate subgraph - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer()), subgraph_masks); + context->createSubgraph(); + auto subgraph_ptr = context->getSubgraphPointer(); + sampler->subgraph_sample(subgraph_sample_size, *(subgraph_ptr), subgraph_masks); for (size_t i = 0; i < num_conv_layers-1; i++) { layers[i]->set_graph_ptr(context->getSubgraphPointer()); } @@ -351,9 +356,10 @@ void Net::construct_layers() { layers[i]->update_dim_size(subgraph_sample_size); layers[i]->add_edge(); } - for (size_t i = 1; i < num_layers; i++) { + for (size_t i = 1; i < num_layers; i++) connect(layers[i - 1], layers[i]); - } + for (size_t i = 0; i < num_layers; i++) + layers[i]->malloc_and_init(); layers[0]->set_in_data(context->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure context->norm_factor_counting(); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 6ee47a452e..c126660fb4 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -3,40 +3,58 @@ #include #include -inline unsigned getDegree(Graph &g, GNode v) { - return std::distance(g.edge_begin(v), g.edge_end(v)); +inline unsigned getDegree(Graph *g, GNode v) { + return std::distance(g->edge_begin(v), g->edge_end(v)); } namespace deepgalois { -void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees) { +void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { + galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); + begin_ = begin; + end_ = end; + count_ = count; + masks_ = masks; + graph = g; +#ifndef GALOIS_USE_DIST + masked_graph = new Graph(); +#endif + generate_masked_graph(g->size(), masks, g, *masked_graph); + size_t idx = 0; + vertices_.resize(count); + for (size_t i = begin; i < end; i++) { + if (masks_[i] == 1) vertices_[idx++] = i; + } +} + +void Sampler::get_masked_degrees(size_t n, mask_t 
*masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); galois::do_all(galois::iterate(size_t(0), n), [&](const GNode src) { if (masks[src] == 1) { - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } } }, galois::loopname("update_degrees")); } -void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *sub) { +void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; + galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST - sub = new Graph(); - sub->allocateFrom(n, ne); - sub->constructNodes(); + sub.allocateFrom(n, ne); + sub.constructNodes(); galois::do_all(galois::iterate((size_t)0, n), [&](const GNode src) { - g.fixEndEdge(src, offsets[src+1]); + sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); - if (masks[dst] == 1) g.constructEdge(idx++, dst, 0); + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } } }, galois::loopname("gen_subgraph")); @@ -48,37 +66,48 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *su // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. -void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set) { +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList vertices, VertexSet &vertex_set) { + galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); assert(nv == vertices.size()); auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); for (int i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); + galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; - galois::do_all(galois::iterate(size_t(0), g.size()), [&](const auto i) { + galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { degrees[i] = (int)getDegree(g, frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); auto u = frontier[pos]; auto degree = degrees[pos]; - auto neighbor_id = rand() % degree; // randomly select a neighbor - auto dst = g.getEdgeDst(g.edge_begin(u) + neighbor_id); - frontier[pos] = dst; - degrees[pos] = getDegree(g, frontier[pos]); - vertex_set.insert(u); + int j =0; + for (; j < degree; j ++) { + auto neighbor_id = rand() % degree; // randomly select a neighbor + auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); + if (vertex_set.find(dst) == vertex_set.end()) { + frontier[pos] = dst; + degrees[pos] = getDegree(g, frontier[pos]); + vertex_set.insert(dst); + break; + } + } + if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); } + galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), "\n"); assert(n == vertex_set.size()); } void 
Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { - std::fill(masks, masks+n, 0); - for (auto v : vertices) masks[v] = 1; + galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); + std::fill(masks, masks+n, 0); + for (auto v : vertices) masks[v] = 1; } -inline VertexList Sampler::reindexing_vertice(VertexSet vertex_set) { - VertexList new_ids(vertex_set.size(), 0); +inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { + VertexList new_ids(n, 0); int vid = 0; for (auto v : vertex_set) { new_ids[v] = vid++; // reindex @@ -90,23 +119,24 @@ inline VertexList Sampler::reindexing_vertice(VertexSet vertex_set) { void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { //auto n = g.size(); // old graph size auto nv = vertex_set.size(); // new graph (subgraph) size - VertexList new_ids = reindexing_vertice(vertex_set); + VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { degrees[new_ids[v]] = std::distance(g.edge_begin(v), g.edge_end(v)); } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; + galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { - g.fixEndEdge(i, offsets[i+1]); + sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; for (auto e : g.edges(old_id)) { - g.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); + sub.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); j ++; } }, galois::loopname("construct_graph")); @@ -115,10 +145,13 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { VertexSet vertex_set; // n = 9000 by default - select_vertices(count_, n, m_, *masked_graph, vertices_, vertex_set); // m = 1000 by default + select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set - generate_masked_graph(n, masks, *masked_graph, &sg); // remove edges whose destination is not masked - generate_subgraph(vertex_set, *masked_graph, sg); +#ifndef GALOIS_USE_DIST + Graph masked_sg; + generate_masked_graph(graph->size(), masks, masked_graph, masked_sg); // remove edges whose destination is not masked + generate_subgraph(vertex_set, masked_sg, sg); +#endif } } // end namespace From 4217994068865950c702f2a6f8fef00f53c8dd70 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 19:40:04 -0500 Subject: [PATCH 197/660] fix pointers --- .../include/deepgalois/DistContext.h | 2 ++ libdeepgalois/include/deepgalois/context.h | 2 ++ .../include/deepgalois/layers/layer.h | 1 + libdeepgalois/include/deepgalois/net.h | 3 -- libdeepgalois/src/context.cpp | 28 +++++++++++++++ libdeepgalois/src/net.cpp | 34 ++----------------- 6 files changed, 36 insertions(+), 34 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 4444143f09..21dd025aec 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -49,6 +49,8 @@ class DistContext { // TODO this is a distributed operation void 
norm_factor_counting(); void createSubgraph() {} + void gen_subgraph_labels(size_t m, const mask_t *masks) {} + void gen_subgraph_feats(size_t m, const mask_t *masks) {} float_t* get_norm_factor_ptr() { return norm_factor; } Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 15b9605cec..eb41fdf200 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -48,6 +48,8 @@ class Context { float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } + void gen_subgraph_labels(size_t m, const mask_t *masks); + void gen_subgraph_feats(size_t m, const mask_t *masks); #else CSRGraph graph_gpu; // the input graph, |V| = N CSRGraph subgraph_gpu; diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c604f6ffbe..b49c8797a9 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -73,6 +73,7 @@ class layer : public deepgalois::node { virtual void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } + void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata #ifdef CPU_ONLY void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index d0adf2d55f..820367bef5 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -113,9 +113,6 @@ class Net { deepgalois::DistContext* context; #endif - void lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sub_labels); - void lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats); - #ifdef CPU_ONLY // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index f6d443f4f1..8d779c0b80 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -31,6 +31,34 @@ void Context::createSubgraph() { subgraph_cpu = new Graph(); } +// generate labels for the subgraph, m is subgraph size +void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { + if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + size_t count = 0; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + if (is_single_class) { + h_labels_subg[count] = h_labels[i]; + } else { + std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, h_labels_subg+count*num_classes); + } + count ++; + } + } +} + +// generate input features for the subgraph, m is subgraph size +void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { + size_t count = 0; + if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, h_feats_subg+count*feat_len); + count ++; + } + } +} + size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); diff --git 
a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index e9ed3b4fd4..a991e5fe17 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -109,34 +109,6 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } -// generate labels for the subgraph -void Net::lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sg_labels) { - if (sg_labels == NULL) sg_labels = new label_t[subgraph_sample_size]; - size_t count = 0; - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - if (is_single_class) { - sg_labels[count] = labels[i]; - } else { - std::copy(labels+i*num_classes, labels+(i+1)*num_classes, sg_labels+count*num_classes); - } - count ++; - } - } -} - -void Net::lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats) { - size_t count = 0; - size_t len = feature_dims[0]; - if (sg_feats == NULL) sg_feats = new float_t[subgraph_sample_size*len]; - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - std::copy(feats+i*len, feats+(i+1)*len, sg_feats+count*len); - count ++; - } - } -} - void Net::train(optimizer* opt, bool need_validate) { std::string header = ""; std::string seperator = " "; @@ -180,12 +152,12 @@ void Net::train(optimizer* opt, bool need_validate) { layers[num_layers - 1]->set_sample_mask(train_begin, train_end, train_count, subgraph_masks); // update labels for subgraph - lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); + context->gen_subgraph_labels(subgraph_sample_size, subgraph_masks); layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph - lookup_feats(num_samples, subgraph_masks, context->get_feats_ptr(), context->get_feats_subg_ptr()); - layers[0]->set_in_data(context->get_feats_subg_ptr()); // feed input data + context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); + layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data #endif num_subg_remain += 1; // num_threads } From db20360b31885992ec70c290f7c9d351d6f1883a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 23:19:52 -0500 Subject: [PATCH 198/660] fix norm_factor --- .../include/deepgalois/DistContext.h | 3 +-- libdeepgalois/include/deepgalois/context.h | 4 +++- .../deepgalois/layers/graph_conv_layer.h | 7 ++---- .../include/deepgalois/layers/layer.h | 2 +- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/context.cpp | 24 ++++++++++--------- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/layers/aggregator.cpp | 8 +++---- libdeepgalois/src/layers/graph_conv_layer.cpp | 14 ++++++++--- libdeepgalois/src/layers/graph_conv_layer.cu | 2 ++ libdeepgalois/src/net.cpp | 5 +++- 11 files changed, 43 insertions(+), 30 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 21dd025aec..04aca5fc9e 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -47,7 +47,7 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_counting(); + void norm_factor_counting(size_t g_size); void createSubgraph() {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} @@ -63,7 +63,6 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - //! 
return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { return h_labels[i]; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index eb41fdf200..e368319dff 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -33,8 +33,9 @@ class Context { float_t* get_norm_factor_ptr() { return norm_factor; } void set_label_class(bool is_single = true) { is_single_class = is_single; } + void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_counting(); + void norm_factor_counting(size_t g_size); #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N @@ -71,6 +72,7 @@ class Context { size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph + bool use_subgraph; // whether to use subgraph label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 7f0aa5a9a3..dc38642330 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -36,7 +36,6 @@ class graph_conv_layer : public layer { void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } - void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data @@ -48,14 +47,13 @@ class graph_conv_layer : public layer { // user-defined aggregate function #ifdef CPU_ONLY virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); -#ifndef CPU_ONLY - void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); -#endif private: bool act_; // whether to use activation function at the end @@ -70,7 +68,6 @@ class graph_conv_layer : public layer { float_t* in_temp1; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y - float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b49c8797a9..79196172ca 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -70,7 +70,7 @@ class layer : public deepgalois::node { // set methods virtual void set_netphase(net_phase phase) {} - virtual void set_context(ContextType* ctx) { context = ctx; } + void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 2a9ad81575..174e7eb210 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,7 +151,7 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_counting() { +void DistContext::norm_factor_counting(size_t g_size) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8d779c0b80..b17f6d7eaa 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -8,8 +8,9 @@ namespace deepgalois { #ifdef CPU_ONLY -Context::Context() : n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), +Context::Context() : n(0), num_classes(0), + feat_len(0), is_single_class(true), + is_selfloop_added(false), use_subgraph(false), h_labels(NULL), h_labels_subg(NULL), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), @@ -119,15 +120,16 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -void Context::norm_factor_counting() { - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); +void Context::norm_factor_counting(size_t g_size) { + Graph *g = graph_cpu; + if (use_subgraph) g = subgraph_cpu; + if (norm_factor == NULL) norm_factor = new float_t[g_size]; + galois::do_all(galois::iterate((size_t)0, 
g_size), [&](auto v) { + auto degree = std::distance(g->edge_begin(v), g->edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); } void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index bdef92b52a..23abd3f1c2 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -97,7 +97,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::norm_factor_counting() { +void Context::norm_factor_counting(size_t g_size) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 40a8fdcf8f..65308172f1 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,12 +6,11 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou bool norm, const float_t* norm_factor) { // zero out the output data #ifndef GALOIS_USE_DIST - galois::do_all(galois::iterate(g), + galois::do_all(galois::iterate(size_t(0), g.size()),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); - galois::do_all(galois::iterate(rangeObj), + galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif - [&](const GNode src) { deepgalois::math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -29,10 +28,11 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // use scaled data to update deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] - } else + } else { // add embeddings from neighbors together deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); // out[src] += in[dst] + } } }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 9903768070..5e3b6aa320 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,7 +37,15 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - deepgalois::update_all(len, g, in, out, norm_, norm_factor); + // normalization constant based on graph structure + float_t* norm_consts = context->get_norm_factor_ptr(); + update_all(len, g, in, out, norm_, norm_consts); +} + +// since graph is symmetric, the derivative is the same +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + float_t* norm_consts = context->get_norm_factor_ptr(); + update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z } void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { @@ -83,6 +91,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; + //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for 
aggregation // else: aggregate first then mult W (not implemented yet) @@ -115,9 +124,8 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying - // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call - deepgalois::update_all(z, *graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + graph_conv_layer::d_aggregate(z, *graph_cpu, out_grad, out_temp); #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index ed89089450..c3f97a49d4 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -30,6 +30,7 @@ void graph_conv_layer::malloc_and_init() { } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else @@ -38,6 +39,7 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo } void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a991e5fe17..7c9d049fc2 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -32,6 +32,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_label_class(is_single_class); + context->set_use_subgraph(subgraph_sample_size > 0); num_samples = context->read_graph(dataset_str, selfloop); if (subgraph_sample_size) sampler = new deepgalois::Sampler(); #else @@ -158,6 +159,8 @@ void Net::train(optimizer* opt, bool need_validate) { // update features for subgraph context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data + + context->norm_factor_counting(subgraph_sample_size); #endif num_subg_remain += 1; // num_threads } @@ -334,7 +337,7 @@ void Net::construct_layers() { layers[i]->malloc_and_init(); layers[0]->set_in_data(context->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - context->norm_factor_counting(); + if (!subgraph_sample_size) context->norm_factor_counting(num_samples); set_contexts(); } From d242fab483a4a07fcb2959351d1941462876eba9 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 08:31:24 -0500 Subject: [PATCH 199/660] udapte g_conv --- libdeepgalois/src/layers/graph_conv_layer.cpp | 64 +++++++++++-------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 5e3b6aa320..c7d0307fd4 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -36,6 +36,7 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t } #ifdef CPU_ONLY +// aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { 
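// A rough per-vertex sketch of what the aggregate call below computes (the
// real work happens in update_all / update_all_csrmm; loop variables in this
// sketch are illustrative only, and the scale w is simply 1 when norm_ is off):
//
//   for (GNode src = 0; src < g.size(); src++) {
//     math::clear_cpu(len, &out[src * len]);
//     for (auto e : g.edges(src)) {
//       auto dst  = g.getEdgeDst(e);
//       float_t w = norm_consts[src] * norm_consts[dst];  // 1/sqrt(deg) per endpoint
//       for (size_t k = 0; k < len; k++)
//         out[src * len + k] += w * in[dst * len + k];    // out[src] += w * in[dst]
//     }
//   }
//
// With norm_consts[v] = 1/sqrt(degree(v)) from norm_factor_counting(), this is
// the standard GCN propagation out = D^{-1/2} A D^{-1/2} * in, evaluated row by row.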
// normalization constant based on graph structure float_t* norm_consts = context->get_norm_factor_ptr(); @@ -84,6 +85,7 @@ void graph_conv_layer::malloc_and_init() { in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x + if (y <= z) in_temp1 = new float_t[x * y]; } // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) @@ -92,22 +94,26 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t y = input_dims[1]; size_t z = output_dims[1]; //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; + // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { + // else: aggregate first then mult W + if (dropout_ && phase_ == net_phase::train) math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); - } else math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); + else math::copy_cpu(x*y, in_data, in_temp); - // aggregate based on graph topology - graph_conv_layer::aggregate(z, *graph_cpu, out_temp, out_data); + if (y > z) { + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + aggregate(z, *graph_cpu, out_temp, out_data); + } else { + aggregate(y, *graph_cpu, in_temp, in_temp1); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); + } #ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync("AggSync"); + layer::context->getSyncSubstrate()->sync("AggSync"); #endif // run relu activation on output if specified if (act_) math::relu_cpu(x*z, out_data, out_data); @@ -122,34 +128,36 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? 
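// Chain-rule recap for the code below, written for the y > z branch where the
// forward pass computed out = A_hat * (in * W), with A_hat the normalized
// adjacency applied by aggregate():
//   d(loss)/d(in) = A_hat^T * d(loss)/d(out) * W^T
//   d(loss)/d(W)  = in^T * (A_hat^T * d(loss)/d(out))
// Since the input graph is symmetric, A_hat^T == A_hat, which is why
// d_aggregate() reuses the same kernel as aggregate(); the W^T and in^T factors
// appear below as the CblasTrans arguments to sgemm_cpu.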
if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); - //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + //else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + + if (y > z) { + d_aggregate(z, *graph_cpu, out_grad, out_temp); + // at this point, out_temp has the derivative of data from last step to + // use for both updating gradients for features and gradients for weights + // this calculates gradients for the node predictions + if (level_ != 0) // no need to calculate in_grad for the first layer + // derivative of matmul needs transposed matrix + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + // calculate weight gradients using input data; multiplied by gradients from last back prop step + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } else { + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], 0.0, in_temp); + d_aggregate(y, *graph_cpu, in_temp, in_grad); + } + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); + } - // this is the aggregate call - graph_conv_layer::d_aggregate(z, *graph_cpu, out_grad, out_temp); #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync("AggSyncBack"); + layer::context->getSyncSubstrate()->sync("AggSyncBack"); #endif - // at this point, out_temp has the derivative of data from last step to - // use for both updating gradients for features and gradients for weights - // this calculates gradients for the node predictions - if (level_ != 0) { // no need to calculate in_grad for the first layer - // derivative of matmul needs transposed matrix - math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, - out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y - if (dropout_) { - math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); - } - } + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); - // calculate weight gradients using input data - // multiplied by gradients from last back prop step - math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, - out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); From 11f7ce4f910aa79a5911ceb2311a901c30e12119 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 16:06:06 -0500 Subject: [PATCH 200/660] add MKL --- CMakeLists.txt | 14 +++++++ cmake/Modules/FindMKL.cmake | 24 ++++++++++++ libdeepgalois/CMakeLists.txt | 30 ++++++++------ .../include/deepgalois/layers/layer.h | 1 - .../include/deepgalois/math_functions.hh | 11 +++--- libdeepgalois/include/deepgalois/optimizer.h | 16 +------- libdeepgalois/src/layers/aggregator.cpp | 20 +++++----- libdeepgalois/src/layers/graph_conv_layer.cpp | 9 +++++ libdeepgalois/src/layers/graph_conv_layer.cu | 1 + libdeepgalois/src/layers/l2_norm_layer.cpp | 1 + libdeepgalois/src/layers/l2_norm_layer.cu | 1 + libdeepgalois/src/layers/leaky_relu_layer.cpp | 1 + libdeepgalois/src/layers/leaky_relu_layer.cu | 1 + libdeepgalois/src/layers/relu_layer.cpp | 1 + libdeepgalois/src/layers/relu_layer.cu | 1 + .../src/layers/sigmoid_loss_layer.cpp | 1 + .../src/layers/sigmoid_loss_layer.cu | 1 + 
.../src/layers/softmax_loss_layer.cpp | 1 + .../src/layers/softmax_loss_layer.cu | 1 + libdeepgalois/src/math_functions.cpp | 39 ++++++++++--------- libdeepgalois/src/net.cpp | 9 ++++- libdeepgalois/src/net.cu | 1 + libdeepgalois/src/optimizer.cpp | 1 + libdeepgalois/src/optimizer.cu | 17 +++++++- libdeepgalois/src/sampler.cpp | 9 ++++- .../include/galois/graphs/LC_CSR_Graph.h | 3 ++ lonestargnn/CMakeLists.txt | 14 +++---- 27 files changed, 157 insertions(+), 72 deletions(-) create mode 100644 cmake/Modules/FindMKL.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4be9753f54..1f1b853aef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,8 @@ set(GALOIS_COPYRIGHT_YEAR "2018") # Also in COPYRIGHT #set(GCC_MPIP_LINK_FLAGS "-L${MPIP_DIR} -L${LIBUNWIND_DIR} -lmpiP -lbfd -liberty -lm -lunwind") #link_directories(LIBUNWIND_DIR MPIP_DIR) +SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) +SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) if(NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to release") @@ -37,6 +39,7 @@ set(NUM_TEST_GPUS "0" CACHE STRING "Number of test GPUs to use (on a single mach ###### General features ###### set(USE_GPROF OFF CACHE BOOL "Enable GCC profiling") set(USE_VTUNE OFF CACHE BOOL "Use VTune for profiling") +set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") set(USE_PAPI OFF CACHE BOOL "Use PAPI counters for profiling") set(USE_HPCTK OFF CACHE BOOL "Use HPCToolKit for profiling") set(USE_STRICT_CONFIG OFF CACHE BOOL "Instead of falling back gracefully, fail") @@ -320,6 +323,17 @@ if(USE_VTUNE) endif() endif() +if(USE_MKL_BLAS) + SET(MKL_ROOT /opt/apps/sysnet/intel/17.0/mkl) + find_package(MKL) + message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") + if (MKL_FOUND) + include_directories(${MKL_INCLUDE_DIRS}) + else() + message(WARNING "MKL not found") + endif() +endif() + if(USE_PAPI) if (PAPI_ROOT STREQUAL "") set(PAPI_ROOT /usr) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000000..d87020f770 --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,24 @@ +# Find MKL libraries +# Once done this will define +# MKL_FOUND - System has MKL +# MKL_INCLUDE_DIRS - The MKL include directories +# MKL_LIBRARIES - The libraries needed to use MKL + +set(MKL_LIBRARIES) # Include-only library + +if(MKL_INCLUDE_DIRS) + set(MKL_FIND_QUIETLY TRUE) +endif() + +find_path(MKL_INCLUDE_DIRS mkl.h PATHS ${MKL_ROOT} PATH_SUFFIXES include) +message(STATUS "MKL_INCLUDE_DIRS: ${MKL_INCLUDE_DIRS}") +find_library(MKL_LIBRARY NAMES mkl_rt PATHS ${MKL_ROOT} PATH_SUFFIXES lib/intel64) +message(STATUS "MKL_LIBRARY: ${MKL_LIBRARY}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIRS) +if(MKL_FOUND) + set(MKL_FOUND on) +endif() + +mark_as_advanced(MKL_INCLUDE_DIRS) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3f592f0d18..de0cd30dc9 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,10 +1,20 @@ cmake_minimum_required(VERSION 2.8) -# open blas -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) -include_directories(${OPENBLAS_INC}) -link_directories(${OPENBLAS_LIB}) +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) +set(BLAS_LIB -lopenblas) +if(USE_MKL_BLAS) + SET(BLAS_INC_DIR ${MKL_ROOT}/include) + SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) + 
set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") +endif() + +# blas library +include_directories(${BLAS_INC_DIR}) +link_directories(${BLAS_LIB_DIR}) +message(STATUS "BLAS_INC_DIR: ${BLAS_INC_DIR}") +message(STATUS "BLAS_LIB_DIR: ${BLAS_LIB_DIR}") # galois base libs include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) @@ -19,9 +29,7 @@ else() include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers include_directories("${MGPU_ROOT}/src") - - SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) - include_directories(${CUDA_INC}) + include_directories(${CUDA_HOME}/include) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) find_package(CUDA REQUIRED) @@ -32,9 +40,7 @@ else() #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_61,code=sm_61) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) - #set(CUDA_INCLUDE_DIRS /org/centers/cdgc/cuda/cuda-10.0/include ${CUDA_INCLUDE_DIRS}) - SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) - link_directories(${CUDA_LIB}) + link_directories(${CUDA_HOME}/lib64) link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES @@ -100,7 +106,7 @@ endif() add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem gllvm) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) -target_link_libraries(dg_cpu -lopenblas) +target_link_libraries(dg_cpu ${BLAS_LIB}) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 79196172ca..7f1c05ce60 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -16,7 +16,6 @@ #include "deepgalois/DistContext.h" #endif #include "deepgalois/optimizer.h" -#include "deepgalois/math_functions.hh" #include "deepgalois/layers/node.h" #ifdef GALOIS_USE_DIST #include "galois/graphs/GluonSubstrate.h" diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 9c0e58dc45..72b836da64 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -10,15 +10,16 @@ #include #include "deepgalois/types.h" +#ifdef USE_MKL +#include +#else // If use MKL, simply include the MKL header extern "C" { #include -//#include } - -// TODO namespace - +#endif namespace deepgalois { + namespace math { //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); @@ -27,6 +28,7 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); void axpy(size_t n, const float_t a, float_t *x, float_t *y); +int argmax(const size_t n, const float_t* x); // the arguments of the maxima //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); //! 
clear n elements of a vector @@ -118,7 +120,6 @@ void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, void transpose2D(const tensor_t& in, tensor_t& out); void transpose2D1D(const tensor_t& in, vec_t& out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima -int argmax(const size_t n, const float_t* x); // the arguments of the maxima // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index b6a90917ff..b745f12cb6 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -16,9 +16,6 @@ #include #include #include "deepgalois/types.h" -#ifndef CPU_ONLY -#include "deepgalois/math_functions.hh" -#endif namespace deepgalois { @@ -41,10 +38,8 @@ struct optimizer { template struct stateful_optimizer : public optimizer { void reset() override { - for (auto& e : E_) - e.clear(); + for (auto& e : E_) e.clear(); } - protected: template vec_t& get(const vec_t& key) { @@ -56,14 +51,7 @@ struct stateful_optimizer : public optimizer { std::unordered_map E_[N]; #ifndef CPU_ONLY template - float_t *get_gpu(const size_t n, const float_t *key) { - static_assert(Index < N, "index out of range"); - if (!is_allocated_device(dE_[Index][key])) { - float_malloc_device(n, dE_[Index][key]); - init_const_gpu(n, 0.0, dE_[Index][key]); - } - return dE_[Index][key]; - } + float_t *get_gpu(const size_t n, const float_t *key); std::unordered_map dE_[N]; #endif }; diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 65308172f1..b374dd9d91 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -11,7 +11,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif - deepgalois::math::clear_cpu(len , &out[src * len]); + math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; // get normalization factor if needed @@ -24,14 +24,12 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou b = a * norm_factor[dst]; vec_t neighbor(len); // scale the neighbor's data using the normalization factor - deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); - // use scaled data to update - deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], - &out[src * len]); // out[src] += in[dst] + math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + // use scaled data to update; out[src] += in[dst] + math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); } else { - // add embeddings from neighbors together - deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], - &out[src * len]); // out[src] += in[dst] + // add embeddings from neighbors together; out[src] += in[dst] + math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); } } }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); @@ -40,8 +38,8 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.size(); - deepgalois::math::clear_cpu(n*len, out); - //csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, - // (const 
int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + math::clear_cpu(n*len, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); } #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c7d0307fd4..dae3d14ce5 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" #include "deepgalois/utils.h" namespace deepgalois { @@ -40,13 +41,21 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { // normalization constant based on graph structure float_t* norm_consts = context->get_norm_factor_ptr(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else update_all(len, g, in, out, norm_, norm_consts); +#endif } // since graph is symmetric, the derivative is the same void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { float_t* norm_consts = context->get_norm_factor_ptr(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#else update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#endif } void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index c3f97a49d4..41f6e30a0f 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 46379aed60..3e12a1d603 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu index 56128eb0d3..e600b6fbbb 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cu +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index 0d5a7f66fb..f7cfe375cc 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu index 43e7f93d04..6fe4d005ac 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cu +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 2e89af1bd5..aee6e29a07 100644 --- 
a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu index f3a45936b4..0d39a9dab2 100644 --- a/libdeepgalois/src/layers/relu_layer.cu +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 19606eec6c..ca34389127 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 4159569601..f00689dfc9 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" #include "gg.h" #include "ggcuda.h" diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 9e4fda933e..f1c1aa27e4 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index fd3fc11140..59a955526b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" #include "gg.h" #include "ggcuda.h" diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 555eb7bfca..aa41ffc41f 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -4,10 +4,13 @@ #include #include "deepgalois/utils.h" +#ifdef USE_MKL +#include +#else // If use MKL, simply include the MKL header extern "C" { #include -//#include } +#endif #define NOT_IMPLEMENTED \ do { \ @@ -38,9 +41,11 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #ifdef USE_MKL const char *matdescra = "GXXCX";//6 bytes const char transa = 'N'; - mkl_scsrmm (&transa, &M , &N, &K, &alpha , matdescra, - A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, - B, &N, &beta , C, &N); + printf("Calling Intel MKL\n"); + exit(1); + mkl_scsrmm(&transa, &M , &N, &K, &alpha , matdescra, + A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, + B, &N, &beta , C, &N); #else NOT_IMPLEMENTED; #endif @@ -126,6 +131,18 @@ void axpy(size_t n, const float_t a, float_t *x, float_t *y) { cblas_saxpy(n, a, x, 1, y, 1); } +int argmax(const size_t n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } @@ -549,20 +566,6 @@ int argmax(const size_t n, const vec_t& x) { return max_ind; } -int argmax(const size_t n, const 
float_t* x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - - - void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7c9d049fc2..7da9fcbb18 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -4,6 +4,7 @@ #include "deepgalois/net.h" #include "deepgalois/utils.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { @@ -84,6 +85,10 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } + if (subgraph_sample_size > train_count) { + galois::gPrint("FATAL: subgraph size can not be larger than the size of training set\n"); + exit(1); + } // NOTE: train_begin/train_end are global IDs, train_masks is a local id // train count and val count are LOCAL counts @@ -440,7 +445,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks #ifndef GALOIS_USE_DIST if (masks[i] == 1) { // get prediction - int preds = argmax(num_classes, + int preds = math::argmax(num_classes, &(layers[num_conv_layers - 1]->next()->get_data()[i * num_classes])); // check prediction if ((label_t)preds == context->get_label(i)) @@ -455,7 +460,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks uint32_t localID = dGraph->getLID(i); if (masks[localID] == 1) { // get prediction - int preds = argmax(num_classes, + int preds = math::argmax(num_classes, &(layers[num_conv_layers - 1]->next()->get_data()[localID * num_classes])); // check prediction if ((label_t)preds == context->get_label(localID)) diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 900ba1a762..3077566512 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -1,5 +1,6 @@ #include "deepgalois/net.h" #include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" #include "gg.h" #include "ggcuda.h" #include diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index c3267f282e..0f00b4da33 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/optimizer.h" #include "galois/Galois.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 7628c3aeba..355d959254 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -14,7 +14,20 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } } -void deepgalois::adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { +namespace deepgalois { + +template +template +float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { + static_assert(Index < N, "index out of range"); + if (!is_allocated_device(dE_[Index][key])) { + float_malloc_device(n, dE_[Index][key]); + init_const_gpu(n, 0.0, dE_[Index][key]); + } + return dE_[Index][key]; +} + +void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); float_t* cache = get_gpu<0>(n, W); @@ -25,3 +38,5 @@ void deepgalois::adam::update_gpu(const size_t n, const float_t* dW, float_t* W) b1_t *= b1; b2_t *= b2; } + +} diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 
c126660fb4..a0816f4cea 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -96,8 +96,15 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); } - galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), "\n"); assert(n == vertex_set.size()); + galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); + int counter = 0; + for (int i : vertex_set) { + counter ++; + if (counter > 16 && counter < n-16) continue; + galois::gPrint(i, " "); + } + galois::gPrint(" )\n"); } void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 5516b22a92..a786b1b6aa 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -323,6 +323,9 @@ class LC_CSR_Graph : ar >> edgeData; } + // cxh + uint64_t* row_start_ptr() { return &edgeIndData[0]; } + uint32_t* edge_dst_ptr() { return &edgeDst[0]; } /** * Accesses the "prefix sum" of this graph; takes advantage of the fact * that edge_end(n) is basically prefix_sum[n] (if a prefix sum existed + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index a06dd1907b..b551fa8acb 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -5,17 +5,17 @@ include_directories(BEFORE include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) - -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) -include_directories(${CUDA_INC}) +include_directories(${CUDA_HOME}/include) if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) -include_directories(${OPENBLAS_INC}) -link_directories(${OPENBLAS_LIB}) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) +if(USE_MKL_BLAS) + SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") +endif() +link_directories(${BLAS_LIB_DIR}) + if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") endif() From a2583e8cde5e0805a1f8680349981f7a1b8a612f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:34:23 -0500 Subject: [PATCH 201/660] app no longer used by cmake: gcn use new syntax --- lonestargnn/gcn/CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 48c7156dcc..eff742aa69 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,9 +1,13 @@ -app(gcn gcn.cpp) -target_link_libraries(gcn dg_cpu) +#app(gcn gcn.cpp) +add_executable(gcn gcn.cpp) +target_link_libraries(gcn PRIVATE Galois::shmem lonestar) + +target_link_libraries(gcn PRIVATE dg_cpu) if(ENABLE_DIST_GALOIS) - target_link_libraries(gcn distgraphloader) + target_link_libraries(gcn PRIVATE distgraphloader) endif() + if(ENABLE_HETERO_GALOIS) - target_link_libraries(gcn dg_gpu) - target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) + target_link_libraries(gcn PRIVATE dg_gpu) + target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) endif() From 3cd4f8e14189f060d84086e8faac549e6a26fa27 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:36:53 -0500 Subject: 
[PATCH 202/660] signed vs unsigned comparison warning fix --- libdeepgalois/src/sampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index a0816f4cea..257cf1edef 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -98,7 +98,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } assert(n == vertex_set.size()); galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); - int counter = 0; + unsigned counter = 0; for (int i : vertex_set) { counter ++; if (counter > 16 && counter < n-16) continue; From 623809922ee8727ea3df86af1ba379890309cae5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:37:18 -0500 Subject: [PATCH 203/660] initializing endbyte in numamem to var to avoid warning --- libgalois/src/NumaMem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgalois/src/NumaMem.cpp b/libgalois/src/NumaMem.cpp index cd86a970a0..01bdef4545 100644 --- a/libgalois/src/NumaMem.cpp +++ b/libgalois/src/NumaMem.cpp @@ -99,7 +99,7 @@ static void pageInSpecified(void* _ptr, size_t len, size_t pageSize, // first place if (beginLocation != endLocation) { size_t beginByte = beginLocation * elementSize; - size_t endByte; + size_t endByte = 0; if (endLocation != 0) { // -1 since end * element will result in the first From a36c9a2a8e024fe9a8768cc6b56fae717a32bc69 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:55:25 -0500 Subject: [PATCH 204/660] fixing build gcc 8.1 for lonestargnn boiler --- lonestargnn/include/lonestargnn.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 324f5a31ba..77a2777d5f 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -63,9 +63,10 @@ llvm::cl::opt statFile( llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); -static void LonestarGnnPrintVersion() { - std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" - << galois::getRevision() << ")\n"; +static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; + out.flush(); } //! 
initialize lonestargnn benchmark @@ -80,7 +81,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, auto& net = galois::runtime::getSystemNetworkInterface(); if (net.ID == 0) { #endif - LonestarGnnPrintVersion(); + LonestarGnnPrintVersion(llvm::outs()); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; From 70dece5548eb0cb6914b007a2879ed8eea495860 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:55:47 -0500 Subject: [PATCH 205/660] cmake fixes to build gcn after merge TODO openblas needs to be recomopiled with 8.1 --- CMakeLists.txt | 6 +++--- libdeepgalois/CMakeLists.txt | 3 +-- lonestargnn/CMakeLists.txt | 5 ----- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 715ecbc8a9..ef921b9e2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,6 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") include(GNUInstallDirs) -# TODO; this is GNN related; find better way to do than hardcode -SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) -SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) file(STRINGS config/version.txt GALOIS_VERSION) string(REGEX REPLACE "[ \t\n]" "" GALOIS_VERSION ${GALOIS_VERSION}) @@ -47,6 +44,9 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") +# TODO; this is GNN related; find better way to do than hardcode +SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) +SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. 
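The hunk above leaves CUDA_HOME as a plain hard-coded SET(), and the TODO comment it carries concedes that a better mechanism is wanted. A minimal sketch of the cache-variable alternative that TODO points at, mirroring how USE_DEEPGALOIS and USE_MKL_BLAS are already declared (illustration only, not part of any commit in this series):

    # Hypothetical alternative to the hard-coded path above: a cache variable
    # keeps the same default but can be overridden per machine at configure time.
    set(CUDA_HOME "/org/centers/cdgc/cuda/cuda-10.0"
        CACHE PATH "Root directory of the CUDA toolkit")

Declared this way, a different toolkit location can be selected with -DCUDA_HOME=... on the cmake command line instead of editing CMakeLists.txt; the same treatment would apply to OPENBLAS_ROOT.
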
diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index de0cd30dc9..e36e5784bd 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -104,11 +104,10 @@ set(sources endif() add_library(dg_cpu STATIC ${sources}) -target_link_libraries(dg_cpu galois_shmem gllvm) +target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu ${BLAS_LIB}) target_include_directories(dg_cpu PUBLIC - ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include ${CMAKE_CURRENT_SOURCE_DIR}/include ) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index b551fa8acb..1ae1c63d78 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -1,8 +1,3 @@ -include_directories(BEFORE - ${CMAKE_SOURCE_DIR}/libllvm/include - ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include -) - include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) From 47684cea96c339211f2e067511b93109882689dd Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 19:42:00 -0500 Subject: [PATCH 206/660] fix openblas --- CMakeLists.txt | 12 +++++++++++- cmake/Modules/FindOpenBLAS.cmake | 24 ++++++++++++++++++++++++ libdeepgalois/CMakeLists.txt | 6 +++--- lonestargnn/CMakeLists.txt | 2 +- 4 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 cmake/Modules/FindOpenBLAS.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index ef921b9e2d..5a0d440a3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,6 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") # TODO; this is GNN related; find better way to do than hardcode -SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) # This option is automatically handled by CMake. 
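The next hunk of this CMakeLists.txt adds an optional find_package(OpenBLAS) path, backed by the new cmake/Modules/FindOpenBLAS.cmake introduced further down in this patch; that module reports OPENBLAS_FOUND, OPENBLAS_INCLUDE_DIRS and OPENBLAS_LIBRARY. A minimal consumer-side sketch of how a target such as dg_cpu could pick up those results (assumed usage for illustration only; the commit itself keeps linking through the BLAS_LIB flags set in libdeepgalois/CMakeLists.txt):

    # Hypothetical usage of the FindOpenBLAS module added by this patch.
    find_package(OpenBLAS)                # resolves cblas.h and libopenblas
    if(OPENBLAS_FOUND)
      target_include_directories(dg_cpu PUBLIC ${OPENBLAS_INCLUDE_DIRS})
      target_link_libraries(dg_cpu ${OPENBLAS_LIBRARY})
    endif()

The target-scoped form shown here avoids the directory-wide include_directories()/link_directories() calls used elsewhere in these patches, but either style works once the module is on CMAKE_MODULE_PATH.
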
@@ -270,6 +269,17 @@ if(USE_MKL_BLAS) endif() endif() +SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) +if(USE_OPENBLAS) + find_package(OpenBLAS) + message(STATUS "OpenBLAS: ${OPENBLAS_INCLUDE_DIRS}") + if (OPENBLAS_FOUND) + include_directories(${OPENBLAS_INCLUDE_DIRS}) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + if(USE_PAPI) if (PAPI_ROOT STREQUAL "") set(PAPI_ROOT /usr) diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake new file mode 100644 index 0000000000..3f595744d0 --- /dev/null +++ b/cmake/Modules/FindOpenBLAS.cmake @@ -0,0 +1,24 @@ +# Find OpenBLAS libraries +# Once done this will define +# OpenBLAS_FOUND - System has OpenBLAS +# OpenBLAS_INCLUDE_DIRS - The OpenBLAS include directories +# OpenBLAS_LIBRARIES - The libraries needed to use OpenBLAS + +set(OPENBLAS_LIBRARIES) # Include-only library + +if(OPENBLAS_INCLUDE_DIRS) + set(OPENBLAS_FIND_QUIETLY TRUE) +endif() + +find_path(OPENBLAS_INCLUDE_DIRS cblas.h PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES include/openblas) +message(STATUS "OPENBLAS_INCLUDE_DIRS: ${OPENBLAS_INCLUDE_DIRS}") +find_library(OPENBLAS_LIBRARY NAMES openblas PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES lib64) +message(STATUS "OPENBLAS_LIBRARY: ${OPENBLAS_LIBRARY}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OPENBLAS DEFAULT_MSG OPENBLAS_LIBRARY OPENBLAS_INCLUDE_DIRS) +if(OPENBLAS_FOUND) + set(OPENBLAS_FOUND on) +endif() + +mark_as_advanced(OPENBLAS_INCLUDE_DIRS) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index e36e5784bd..fffe49af1a 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required(VERSION 2.8) -SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include) -SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) -set(BLAS_LIB -lopenblas) +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) +set(BLAS_LIB "-lopenblas -lpthread") if(USE_MKL_BLAS) SET(BLAS_INC_DIR ${MKL_ROOT}/include) SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 1ae1c63d78..24c9c6a726 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -5,7 +5,7 @@ if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() -SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) if(USE_MKL_BLAS) SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") endif() From 0c5e36596674ef3f213c5fd77da967f93d79c4ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 21:41:44 -0500 Subject: [PATCH 207/660] add lgraph --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/context.h | 2 ++ libdeepgalois/include/deepgalois/lgraph.h | 33 ++++++++++++++++++++++ libdeepgalois/src/context.cpp | 3 ++ libdeepgalois/src/lgraph.cpp | 31 ++++++++++++++++++++ 5 files changed, 70 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/lgraph.h create mode 100644 libdeepgalois/src/lgraph.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index fffe49af1a..9c6bc0a88f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -97,6 +97,7 @@ set(sources src/optimizer.cpp src/context.cpp src/sampler.cpp + src/lgraph.cpp src/utils.cpp src/node.cpp src/net.cpp diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index e368319dff..affe48ace0 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ 
b/libdeepgalois/include/deepgalois/context.h @@ -7,6 +7,7 @@ #include #include "deepgalois/types.h" #ifdef CPU_ONLY +#include "deepgalois/lgraph.h" #include "deepgalois/gtypes.h" #else #include "graph_gpu.h" @@ -39,6 +40,7 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N + LearningGraph* lgraph; Graph* subgraph_cpu; void createSubgraph(); void add_selfloop(Graph &og, Graph &g); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h new file mode 100644 index 0000000000..dbe66c0092 --- /dev/null +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -0,0 +1,33 @@ +#pragma once +#include "deepgalois/types.h" +#include +#include + +namespace deepgalois { + +typedef uint32_t index_t; + +class LearningGraph { +protected: + index_t num_vertices_; + index_t num_edges_; + index_t *rowptr_; + index_t *colidx_; + index_t *degrees_; +public: + //typedef index_t* iterator; + using iterator = boost::counting_iterator; + LearningGraph(); + ~LearningGraph(); + void readGraph(std::string path, std::string dataset); + index_t getDegree(index_t vid) { return degrees_[vid]; } + index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t edge_begin(index_t vid) { return rowptr_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid+1]; } + index_t* row_start_ptr() { return rowptr_; } + index_t* edge_dst_ptr() { return colidx_; } + iterator begin() const { return iterator(0); } + iterator end() const { return iterator(num_vertices_); } +}; + +} diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index b17f6d7eaa..4320df1bc6 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -68,6 +68,9 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize + } else if (filetype == "bin") { + lgraph = new LearningGraph(); + lgraph->readGraph(path, dataset_str); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp new file mode 100644 index 0000000000..390ba87488 --- /dev/null +++ b/libdeepgalois/src/lgraph.cpp @@ -0,0 +1,31 @@ +#include "deepgalois/lgraph.h" +#include + +namespace deepgalois { + +LearningGraph::LearningGraph() : num_vertices_(0), num_edges_(0), + rowptr_(NULL), colidx_(NULL), degrees_(NULL) {} + +void LearningGraph::readGraph(std::string path, std::string dataset) { + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + std::ifstream ifs; + ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); +} + +} From d0ada11e4bd5511091e35ee30810223ed08c81ee Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 
10:29:09 -0500 Subject: [PATCH 208/660] add lgraph --- CMakeLists.txt | 4 +- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/cutils.h | 6 ++ libdeepgalois/include/deepgalois/lgraph.h | 16 +++- libdeepgalois/include/deepgalois/types.h | 4 + libdeepgalois/src/lgraph.cpp | 96 ++++++++++++++++++++++- libdeepgalois/src/lgraph.cu | 32 ++++++++ 7 files changed, 151 insertions(+), 8 deletions(-) create mode 100644 libdeepgalois/src/lgraph.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a0d440a3c..dc0250a3f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,7 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") # TODO; this is GNN related; find better way to do than hardcode -SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) +SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. @@ -356,14 +356,12 @@ if (ENABLE_HETERO_GALOIS) add_subdirectory(libgpu) endif() -add_subdirectory(libpangolin) # Applications (apps) add_subdirectory(lonestar) if (ENABLE_DIST_GALOIS) add_subdirectory(lonestardist) endif() -add_subdirectory(lonestarmine) add_subdirectory(scripts) add_subdirectory(inputs) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 9c6bc0a88f..be5853f987 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -54,6 +54,7 @@ else() src/math_functions.cu src/optimizer.cu src/context.cu + src/lgraph.cu src/node.cu src/net.cu ) diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 7be873a183..5181408363 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -53,6 +53,8 @@ inline const char* cublasGetErrorString(cublasStatus_t error) { case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; #endif + default: + break; } return "Unknown cublas status"; } @@ -79,6 +81,8 @@ inline const char* cusparseGetErrorString(cusparseStatus_t error) { return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; + default: + break; } return "Unknown cusparse status"; } @@ -111,6 +115,8 @@ inline const char* curandGetErrorString(curandStatus_t error) { return "CURAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + default: + break; } return "Unknown curand status"; } diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index dbe66c0092..126802e07a 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -5,29 +5,41 @@ namespace deepgalois { -typedef uint32_t index_t; - class LearningGraph { protected: + bool is_device; index_t num_vertices_; index_t num_edges_; index_t *rowptr_; index_t *colidx_; index_t *degrees_; + vdata_t *vertex_data_; + edata_t *edge_data_; + public: //typedef index_t* iterator; using iterator = boost::counting_iterator; LearningGraph(); ~LearningGraph(); + void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } void readGraph(std::string path, std::string dataset); + index_t size() { return num_vertices_; } + index_t sizeEdges() { return num_edges_; } index_t getDegree(index_t vid) { 
return degrees_[vid]; } index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } index_t* row_start_ptr() { return rowptr_; } index_t* edge_dst_ptr() { return colidx_; } + index_t* degrees_ptr() { return degrees_; } + edata_t* edge_data_ptr() { return edge_data_; } + vdata_t* vertex_data_ptr() { return vertex_data_; } iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } + void progressPrint(unsigned maxii, unsigned ii); + void allocOnDevice(bool no_edge_data_); + void copy_to_cpu(LearningGraph ©graph); + void copy_to_gpu(LearningGraph ©graph); }; } diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 9c6c79c6e5..3a579a9c5c 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -28,6 +28,10 @@ typedef std::vector VertexList; typedef std::set VertexSet; typedef std::vector dims_t; // dimentions type +typedef uint32_t index_t; // index type +typedef float_t edata_t; // edge data type +typedef float_t vdata_t; // vertex data type + enum class net_phase { train, test }; #define CHUNK_SIZE 256 diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 390ba87488..799812ac1d 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -1,17 +1,102 @@ #include "deepgalois/lgraph.h" +#include "deepgalois/utils.h" +#include +#include +#include +#include /* For O_RDWR */ +#include /* For open(), creat() */ #include +#include +#include namespace deepgalois { -LearningGraph::LearningGraph() : num_vertices_(0), num_edges_(0), - rowptr_(NULL), colidx_(NULL), degrees_(NULL) {} +LearningGraph::LearningGraph() : is_device(false), num_vertices_(0), num_edges_(0), + rowptr_(NULL), colidx_(NULL), degrees_(NULL), + vertex_data_(NULL), edge_data_(NULL) {} + +void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { + const unsigned nsteps = 10; + unsigned ineachstep = (maxii / nsteps); + if(ineachstep == 0) ineachstep = 1; + if (ii % ineachstep == 0) { + int progress = ((size_t) ii * 100) / maxii + 1; + printf("\t%3d%%\r", progress); + fflush(stdout); + } +} void LearningGraph::readGraph(std::string path, std::string dataset) { + std::string filename = path + dataset + ".csgr"; + std::ifstream ifs; + ifs.open(filename); + int masterFD = open(filename.c_str(), O_RDONLY); + if (masterFD == -1) { + std::cout << "LearningGraph: unable to open" << filename << "\n"; + exit(1); + } + struct stat buf; + int f = fstat(masterFD, &buf); + if (f == -1) { + std::cout << "LearningGraph: unable to stat" << filename << "\n"; + exit(1); + } + size_t masterLength = buf.st_size; + int _MAP_BASE = MAP_PRIVATE; + void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); + if (m == MAP_FAILED) { + m = 0; + std::cout << "LearningGraph: mmap failed.\n"; + exit(1); + } + Timer t; + t.Start(); + + uint64_t* fptr = (uint64_t*)m; + __attribute__((unused)) uint64_t version = le64toh(*fptr++); + assert(version == 1); + uint64_t sizeEdgeTy = le64toh(*fptr++); + uint64_t numNodes = le64toh(*fptr++); + uint64_t numEdges = le64toh(*fptr++); + uint64_t *outIdx = fptr; + fptr += numNodes; + uint32_t *fptr32 = (uint32_t*)fptr; + uint32_t *outs = fptr32; + fptr32 += numEdges; + if (numEdges % 2) fptr32 += 1; + num_vertices_ = numNodes; + num_edges_ = numEdges; + if (sizeEdgeTy != 0) { + std::cout << "LearningGraph: currently edge data 
not supported.\n"; + exit(1); + } + + printf("num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + rowptr_[0] = 0; + for (unsigned ii = 0; ii < num_vertices_; ++ii) { + rowptr_[ii+1] = le64toh(outIdx[ii]); + degrees_[ii] = rowptr_[ii+1] - rowptr_[ii]; + for (unsigned jj = 0; jj < degrees_[ii]; ++jj) { + unsigned eid = rowptr_[ii] + jj; + unsigned dst = le32toh(outs[eid]); + if (dst >= num_vertices_) { + printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); + exit(0); + } + colidx_[eid] = dst; + } + progressPrint(num_vertices_, ii); + } + ifs.close(); + +/* std::string file_dims = path + dataset + "-dims.bin"; std::string file_rowptr = path + dataset + "-rowptr.bin"; std::string file_colidx = path + dataset + "-colidx.bin"; index_t dims[2]; - std::ifstream ifs; ifs.open(file_dims, std::ios::binary|std::ios::in); ifs.read((char*)dims, sizeof(index_t) * 2); ifs.close(); @@ -26,6 +111,11 @@ void LearningGraph::readGraph(std::string path, std::string dataset) { ifs.open(file_colidx, std::ios::binary|std::ios::in); ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); ifs.close(); +*/ + t.Stop(); + double runtime = t.Millisecs(); + std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + << masterLength/1000.0/runtime << " MB/s)\n\n"; } } diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu new file mode 100644 index 0000000000..afd4ced9dc --- /dev/null +++ b/libdeepgalois/src/lgraph.cu @@ -0,0 +1,32 @@ + +void LearningGraph::allocOnDevice(bool no_edge_data_) { + if (colidx_ != NULL) return true; + CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_type))); + CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_type))); + CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_type))); + //if (!no_edge_data_) CUDA_CHECK(cudaMalloc((void **) &edge_data_, num_edges_ * sizeof(edge_data__t))); + //CUDA_CHECK(cudaMalloc((void **) &vertex_data_, num_vertices_ * sizeof(vdata_t))); + is_device = true; +} + +void LearningGraph::copy_to_gpu(LearningGraph ©graph) { + copygraph.init(num_vertices_, num_edges_); + copygraph.allocOnDevice(edge_data_ == NULL); + CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_type), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyHostToDevice)); + //if (edge_data_ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data_, edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_, vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); +} + +void LearningGraph::copy_to_cpu(LearningGraph ©graph) { + assert(is_device); + assert(copygraph.size() = num_vertices_); + assert(copygraph.sizeEdges() = num_edges_); + CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_type), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyDeviceToHost)); + //if (edge_data_ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data_ptr(), 
edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_ptr(), vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); +} + From ecb5de88d766572399639442780d77e22c1c5f5b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 12:33:38 -0500 Subject: [PATCH 209/660] add NodeIndexTy and EdgeIndexTy in LC_CSR_Graph --- libdeepgalois/include/deepgalois/context.h | 5 +- libdeepgalois/include/deepgalois/gtypes.h | 4 +- libdeepgalois/include/deepgalois/lgraph.h | 14 ++++- libdeepgalois/src/context.cpp | 27 +++++---- libdeepgalois/src/lgraph.cpp | 43 ++++++++++++--- libdeepgalois/src/lgraph.cu | 9 +++ .../include/galois/graphs/LC_CSR_Graph.h | 55 ++++++++++--------- 7 files changed, 106 insertions(+), 51 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index affe48ace0..6d3d5a884c 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -40,13 +40,16 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N - LearningGraph* lgraph; Graph* subgraph_cpu; + LearningGraph* lgraph; + LearningGraph* lsubgraph; void createSubgraph(); void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; + //LearningGraph* getGraphPointer() { return lgraph; } + //LearningGraph* getSubgraphPointer() { return lsubgraph; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index dfc2e1d8c6..9aa405507c 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -21,8 +21,8 @@ using AccuracyAccum = galois::DGAccumulator; typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< true>::type ::with_no_lockable::type Graph; #else -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< - true>::type ::with_no_lockable::type Graph; +typedef galois::graphs::LC_CSR_Graph:: + with_numa_alloc::type ::with_no_lockable::type Graph; #endif #else using Graph = galois::graphs::DistGraph; diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 126802e07a..03ae92cbff 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -19,14 +19,18 @@ class LearningGraph { public: //typedef index_t* iterator; using iterator = boost::counting_iterator; - LearningGraph(); - ~LearningGraph(); + LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), + rowptr_(NULL), colidx_(NULL), degrees_(NULL), + vertex_data_(NULL), edge_data_(NULL) {} + LearningGraph() : LearningGraph(false) {} + ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } void readGraph(std::string path, std::string dataset); index_t size() { return num_vertices_; } index_t sizeEdges() { return num_edges_; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t get_degree(index_t vid) { return degrees_[vid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } index_t* row_start_ptr() { return rowptr_; } @@ -40,6 
+44,12 @@ class LearningGraph { void allocOnDevice(bool no_edge_data_); void copy_to_cpu(LearningGraph ©graph); void copy_to_gpu(LearningGraph ©graph); + void dealloc(); + void degree_counting(); + void allocateFrom(index_t nv, index_t ne); + void constructNodes(); + void fixEndEdge(index_t vid, index_t row_end); + void constructEdge(index_t eid, index_t dst, edata_t edata); }; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 4320df1bc6..9013e563f5 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -30,6 +30,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { void Context::createSubgraph() { subgraph_cpu = new Graph(); + lsubgraph = new LearningGraph(); } // generate labels for the subgraph, m is subgraph size @@ -63,7 +64,6 @@ void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); - graph_cpu = new Graph(); if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); @@ -72,6 +72,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo lgraph = new LearningGraph(); lgraph->readGraph(path, dataset_str); } else if (filetype == "gr") { + graph_cpu = new Graph(); std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { @@ -86,9 +87,10 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo exit(1); } Tread.stop(); - std::cout << "num_vertices " << graph_cpu->size() << " num_edges " - << graph_cpu->sizeEdges() << "\n"; - return graph_cpu->size(); + auto g = getGraphPointer(); + std::cout << "num_vertices " << g->size() << " num_edges " + << g->sizeEdges() << "\n"; + return g->size(); } void Context::add_selfloop(Graph &og, Graph &g) { @@ -124,11 +126,13 @@ void Context::add_selfloop(Graph &og, Graph &g) { } void Context::norm_factor_counting(size_t g_size) { - Graph *g = graph_cpu; - if (use_subgraph) g = subgraph_cpu; + auto g = getGraphPointer(); + auto subg = getSubgraphPointer(); + g->degree_counting(); + if (use_subgraph) g = subg; if (norm_factor == NULL) norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { - auto degree = std::distance(g->edge_begin(v), g->edge_end(v)); + auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; @@ -185,14 +189,15 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self colidx_[offsets[i]++] = dst; } - graph_cpu->allocateFrom(num_vertices_, num_edges_); - graph_cpu->constructNodes(); + auto g = getGraphPointer(); + g->allocateFrom(num_vertices_, num_edges_); + g->constructNodes(); for (size_t i = 0; i < num_vertices_; i++) { auto row_begin = rowptr_[i]; auto row_end = rowptr_[i+1]; - graph_cpu->fixEndEdge(i, row_end); + g->fixEndEdge(i, row_end); for (auto offset = row_begin; offset < row_end; offset++) - graph_cpu->constructEdge(offset, colidx_[offset], 0); + g->constructEdge(offset, colidx_[offset], 0); } } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 799812ac1d..4e65d838a5 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -10,11 +10,6 @@ #include namespace deepgalois { - 
-LearningGraph::LearningGraph() : is_device(false), num_vertices_(0), num_edges_(0), - rowptr_(NULL), colidx_(NULL), degrees_(NULL), - vertex_data_(NULL), edge_data_(NULL) {} - void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { const unsigned nsteps = 10; unsigned ineachstep = (maxii / nsteps); @@ -26,6 +21,27 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { } } +void LearningGraph::allocateFrom(index_t nv, index_t ne) { +} + +void LearningGraph::constructNodes() { +} + +void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { +} + +void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { +} + +void degree_counting() { +/* + degrees = new uint32_t[num_vertices_]; + galois::do_all(galois::iterate(begin(), end()), [&] (auto v) { + degrees[v] = std::distance(this->edge_begin(v), this->edge_end(v)); + }, galois::loopname("DegreeCounting")); +*/ +} + void LearningGraph::readGraph(std::string path, std::string dataset) { std::string filename = path + dataset + ".csgr"; std::ifstream ifs; @@ -56,15 +72,15 @@ void LearningGraph::readGraph(std::string path, std::string dataset) { __attribute__((unused)) uint64_t version = le64toh(*fptr++); assert(version == 1); uint64_t sizeEdgeTy = le64toh(*fptr++); - uint64_t numNodes = le64toh(*fptr++); + uint64_t nv = le64toh(*fptr++); uint64_t numEdges = le64toh(*fptr++); uint64_t *outIdx = fptr; - fptr += numNodes; + fptr += nv; uint32_t *fptr32 = (uint32_t*)fptr; uint32_t *outs = fptr32; fptr32 += numEdges; if (numEdges % 2) fptr32 += 1; - num_vertices_ = numNodes; + num_vertices_ = nv; num_edges_ = numEdges; if (sizeEdgeTy != 0) { std::cout << "LearningGraph: currently edge data not supported.\n"; @@ -118,4 +134,15 @@ void LearningGraph::readGraph(std::string path, std::string dataset) { << masterLength/1000.0/runtime << " MB/s)\n\n"; } +#ifdef CPU_ONLY +void LearningGraph::dealloc() { + assert (!is_device); + free(rowptr_); + free(colidx_); + free(degrees_); + if (vertex_data_ != NULL) free(vertex_data_); + if (edge_data_ != NULL) free(edge_data_); } +#endif + +} // end namespace diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index afd4ced9dc..14b7239358 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -1,4 +1,13 @@ +void LearningGraph::dealloc() { + assert(is_device); + CUDA_CHECK(cudaFree(colidx_)); + CUDA_CHECK(cudaFree(rowptr_)); + CUDA_CHECK(cudaFree(degrees_)); + if (edge_data != NULL) CUDA_CHECK(cudaFree(edge_data)); + if (vertex_data != NULL) CUDA_CHECK(cudaFree(vertex_data)); +} + void LearningGraph::allocOnDevice(bool no_edge_data_) { if (colidx_ != NULL) return true; CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_type))); diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 903354f83a..ff6f7b9caf 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -61,7 +61,8 @@ namespace graphs { template numa-blocked, false => numa-interleaved - bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy> + bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy, + typename NodeIndexTy = uint32_t, typename EdgeIndexTy = uint64_t > class LC_CSR_Graph : //! 
[doxygennuma] private boost::noncopyable, @@ -134,18 +135,18 @@ class LC_CSR_Graph : protected: typedef LargeArray EdgeData; - typedef LargeArray EdgeDst; + typedef LargeArray EdgeDst; typedef internal::NodeInfoBaseTypes NodeInfoTypes; typedef internal::NodeInfoBase NodeInfo; - typedef LargeArray EdgeIndData; + typedef LargeArray EdgeIndData; typedef LargeArray NodeData; public: - typedef uint32_t GraphNode; + typedef NodeIndexTy GraphNode; typedef EdgeTy edge_data_type; typedef FileEdgeTy file_edge_data_type; typedef NodeTy node_data_type; @@ -333,8 +334,8 @@ class LC_CSR_Graph : } // cxh - uint64_t* row_start_ptr() { return &edgeIndData[0]; } - uint32_t* edge_dst_ptr() { return &edgeDst[0]; } + EdgeIndexTy* row_start_ptr() { return &edgeIndData[0]; } + NodeIndexTy* edge_dst_ptr() { return &edgeDst[0]; } /** * Accesses the "prefix sum" of this graph; takes advantage of the fact * that edge_end(n) is basically prefix_sum[n] (if a prefix sum existed + @@ -349,7 +350,7 @@ class LC_CSR_Graph : uint64_t operator[](uint64_t n) { return *(edge_end(n)); } template - LC_CSR_Graph(uint32_t _numNodes, uint64_t _numEdges, EdgeNumFnTy edgeNum, + LC_CSR_Graph(NodeIndexTy _numNodes, EdgeIndexTy _numEdges, EdgeNumFnTy edgeNum, EdgeDstFnTy _edgeDst, EdgeDataFnTy _edgeData) : numNodes(_numNodes), numEdges(_numEdges) { // std::cerr << "\n**" << numNodes << " " << numEdges << "\n\n"; @@ -552,7 +553,7 @@ class LC_CSR_Graph : } } - void allocateFrom(uint32_t nNodes, uint64_t nEdges) { + void allocateFrom(NodeIndexTy nNodes, EdgeIndexTy nEdges) { numNodes = nNodes; numEdges = nEdges; @@ -571,7 +572,7 @@ class LC_CSR_Graph : } } - void destroyAndAllocateFrom(uint32_t nNodes, uint64_t nEdges) { + void destroyAndAllocateFrom(NodeIndexTy nNodes, EdgeIndexTy nEdges) { numNodes = nNodes; numEdges = nEdges; @@ -595,7 +596,7 @@ class LC_CSR_Graph : void constructNodes() { #ifndef GALOIS_GRAPH_CONSTRUCT_SERIAL - for (uint32_t x = 0; x < numNodes; ++x) { + for (NodeIndexTy x = 0; x < numNodes; ++x) { nodeData.constructAt(x); this->outOfLineConstructAt(x); } @@ -623,15 +624,15 @@ class LC_CSR_Graph : edgeData.destroy(); } - void constructEdge(uint64_t e, uint32_t dst, + void constructEdge(EdgeIndexTy e, NodeIndexTy dst, const typename EdgeData::value_type& val) { edgeData.set(e, val); edgeDst[e] = dst; } - void constructEdge(uint64_t e, uint32_t dst) { edgeDst[e] = dst; } + void constructEdge(EdgeIndexTy e, NodeIndexTy dst) { edgeDst[e] = dst; } - void fixEndEdge(uint32_t n, uint64_t e) { edgeIndData[n] = e; } + void fixEndEdge(NodeIndexTy n, EdgeIndexTy e) { edgeIndData[n] = e; } /** * Perform an in-memory transpose of the graph, replacing the original @@ -681,7 +682,7 @@ class LC_CSR_Graph : // TODO is it worth doing parallel prefix sum? 
// prefix sum calculation of the edge index array - for (uint32_t n = 1; n < numNodes; ++n) { + for (NodeIndexTy n = 1; n < numNodes; ++n) { edgeIndData_temp[n] += edgeIndData_temp[n - 1]; } @@ -735,15 +736,15 @@ class LC_CSR_Graph : } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, uint64_t e_new, - uint64_t e, + void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, + EdgeIndexTy e, typename std::enable_if::type* = 0) { edgeData_new[e_new] = edgeData[e]; } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, uint64_t e_new, - uint64_t e, + void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, + EdgeIndexTy e, typename std::enable_if::type* = 0) { // does nothing } @@ -793,7 +794,7 @@ class LC_CSR_Graph : * Adding for Louvain clustering * TODO: Find better way to do this */ - void constructFrom(uint32_t numNodes, uint64_t numEdges, std::vector& prefix_sum, std::vector>& edges_id, std::vector>& edges_data) { + void constructFrom(NodeIndexTy numNodes, EdgeIndexTy numEdges, std::vector& prefix_sum, std::vector>& edges_id, std::vector>& edges_data) { //allocateFrom(numNodes, numEdges); /* * Deallocate if reusing the graph @@ -801,13 +802,13 @@ class LC_CSR_Graph : destroyAndAllocateFrom(numNodes, numEdges); constructNodes(); - galois::do_all(galois::iterate((uint32_t)0, numNodes), - [&](uint32_t n) { + galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), + [&](NodeIndexTy n) { edgeIndData[n] = prefix_sum[n]; }); - galois::do_all(galois::iterate((uint32_t)0, numNodes), - [&](uint32_t n) { + galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), + [&](NodeIndexTy n) { if( n == 0){ if(edgeIndData[n] > 0){ std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin()); @@ -879,11 +880,11 @@ class LC_CSR_Graph : readPosition = ((4 + numNodes) * sizeof(uint64_t)); graphFile.seekg(readPosition); if(version == 1) { - graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint32_t)*numEdges); - readPosition = ((4 + numNodes) * sizeof(uint64_t) + numEdges * sizeof(uint32_t)); + graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(NodeIndexTy)*numEdges); + readPosition = ((4 + numNodes) * sizeof(uint64_t) + numEdges * sizeof(NodeIndexTy)); // version 1 padding TODO make version agnostic if (numEdges% 2) { - readPosition += sizeof(uint32_t); + readPosition += sizeof(NodeIndexTy); } } else if(version == 2) { graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint64_t)*numEdges); @@ -955,7 +956,7 @@ void readGraphFromGRFile(const std::string& filename) { readPosition = ((4 + numNodes) * sizeof(uint64_t)); graphFile.seekg(readPosition); if(version == 1) { - graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint32_t)*numEdges); + graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(NodeIndexTy)*numEdges); } else if(version == 2) { graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint64_t)*numEdges); } else { From e7fe5d859f5e00a6168226bc82abb1ee69454215 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 13:24:49 -0500 Subject: [PATCH 210/660] fix types --- libdeepgalois/include/deepgalois/context.h | 5 ----- libdeepgalois/include/deepgalois/gtypes.h | 18 ++++++++++-------- libdeepgalois/include/deepgalois/lgraph.h | 5 +++-- libdeepgalois/include/deepgalois/sampler.h | 6 +++--- libdeepgalois/src/context.cpp | 13 +++++++------ libdeepgalois/src/layers/aggregator.cpp | 2 +- libdeepgalois/src/lgraph.cpp | 5 ++++- libdeepgalois/src/sampler.cpp | 20 
++++++++------------ lonestargnn/gcn/gcn.cpp | 2 +- 9 files changed, 37 insertions(+), 39 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 6d3d5a884c..e368319dff 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,7 +7,6 @@ #include #include "deepgalois/types.h" #ifdef CPU_ONLY -#include "deepgalois/lgraph.h" #include "deepgalois/gtypes.h" #else #include "graph_gpu.h" @@ -41,15 +40,11 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N Graph* subgraph_cpu; - LearningGraph* lgraph; - LearningGraph* lsubgraph; void createSubgraph(); void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; - //LearningGraph* getGraphPointer() { return lgraph; } - //LearningGraph* getSubgraphPointer() { return lsubgraph; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 9aa405507c..fe759803e2 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,14 +1,14 @@ -#ifndef __DG_GTYPES__ -#define __DG_GTYPES__ +#pragma once #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" #include "deepgalois/types.h" +#include "deepgalois/lgraph.h" #ifdef GALOIS_USE_DIST #include "galois/graphs/NewGeneric.h" #endif -// TODO namespace +namespace deepgalois { typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; @@ -19,15 +19,17 @@ using AccuracyAccum = galois::DGAccumulator; #ifndef GALOIS_USE_DIST #ifdef EDGE_LABEL typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< - true>::type ::with_no_lockable::type Graph; + true>::type ::with_no_lockable::type LCGraph; #else typedef galois::graphs::LC_CSR_Graph:: - with_numa_alloc::type ::with_no_lockable::type Graph; + with_numa_alloc::type ::with_no_lockable::type LCGraph; #endif +//typedef LCGraph Graph; +//typedef Graph::edge_iterator edge_iterator; +typedef LearningGraph Graph; +typedef index_t edge_iterator; #else using Graph = galois::graphs::DistGraph; #endif -typedef Graph::GraphNode GNode; - -#endif +} diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 03ae92cbff..bf3ace2470 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -26,8 +26,9 @@ class LearningGraph { ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } void readGraph(std::string path, std::string dataset); - index_t size() { return num_vertices_; } - index_t sizeEdges() { return num_edges_; } + void readGraphFromGRFile(const std::string& filename); + size_t size() { return (size_t)num_vertices_; } + size_t sizeEdges() { return (size_t)num_edges_; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t get_degree(index_t vid) { return degrees_[vid]; } diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 01616d01f5..15c82ffa12 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -13,11 +13,11 @@ class Sampler { // !API function for 
user-defined selection strategy virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); - galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); + galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); - Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } + edge_iterator sampled_edge_begin(Graph &g, VertexID v) { return g.edge_begin(v); } - Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } + edge_iterator sampled_edge_end(Graph &g, VertexID v) { return g.edge_end(v); } void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, Graph* g); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9013e563f5..9b5b858206 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -30,7 +30,6 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { void Context::createSubgraph() { subgraph_cpu = new Graph(); - lsubgraph = new LearningGraph(); } // generate labels for the subgraph, m is subgraph size @@ -62,25 +61,27 @@ void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { } size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { + std::string filename = path + dataset_str + ".csgr"; galois::StatTimer Tread("GraphReadingTime"); Tread.start(); if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; + filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "bin") { - lgraph = new LearningGraph(); - lgraph->readGraph(path, dataset_str); + graph_cpu->readGraphFromGRFile(filename); } else if (filetype == "gr") { graph_cpu = new Graph(); std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { Graph graph_temp; - galois::graphs::readGraph(graph_temp, filename); + //galois::graphs::readGraph(graph_temp, filename); + graph_temp.readGraphFromGRFile(filename); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; - } else galois::graphs::readGraph(*graph_cpu, filename); + //} else galois::graphs::readGraph(*graph_cpu, filename); + } else graph_cpu->readGraphFromGRFile(filename); // TODO dist version of self loop } else { printf("Unkown file format\n"); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index b374dd9d91..7dc1752436 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -17,7 +17,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // get normalization factor if needed if (norm) a = norm_factor[src]; // gather neighbors' embeddings - for (const auto e : g.edges(src)) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { const auto dst = g.getEdgeDst(e); if (norm) { // normalize b as well diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 4e65d838a5..ba4432aca9 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -33,7 +33,7 @@ void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { } -void degree_counting() { +void LearningGraph::degree_counting() { /* degrees = new uint32_t[num_vertices_]; galois::do_all(galois::iterate(begin(), end()), [&] 
(auto v) { @@ -44,6 +44,9 @@ void degree_counting() { void LearningGraph::readGraph(std::string path, std::string dataset) { std::string filename = path + dataset + ".csgr"; +} + +void LearningGraph::readGraphFromGRFile(const std::string& filename) { std::ifstream ifs; ifs.open(filename); int masterFD = open(filename.c_str(), O_RDONLY); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 257cf1edef..a1b1e4feba 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -3,10 +3,6 @@ #include #include -inline unsigned getDegree(Graph *g, GNode v) { - return std::distance(g->edge_begin(v), g->edge_end(v)); -} - namespace deepgalois { void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { @@ -29,9 +25,9 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(size_t(0), n), [&](const GNode src) { + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { if (masks[src] == 1) { - for (const auto e : g->edges(src)) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } @@ -48,11 +44,11 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); - galois::do_all(galois::iterate((size_t)0, n), [&](const GNode src) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; - for (const auto e : g->edges(src)) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } @@ -77,7 +73,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { - degrees[i] = (int)getDegree(g, frontier[i]); + degrees[i] = (int)g->get_degree(frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); @@ -89,7 +85,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); if (vertex_set.find(dst) == vertex_set.end()) { frontier[pos] = dst; - degrees[pos] = getDegree(g, frontier[pos]); + degrees[pos] = g->get_degree(frontier[pos]); vertex_set.insert(dst); break; } @@ -129,7 +125,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { - degrees[new_ids[v]] = std::distance(g.edge_begin(v), g.edge_end(v)); + degrees[new_ids[v]] = g.get_degree(v); } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; @@ -142,7 +138,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; - for (auto e : g.edges(old_id)) { + for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { sub.constructEdge(offsets[i]+j, 
g.getEdgeDst(e), 0); j ++; } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index de999a095e..fa492172a5 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,7 +18,7 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train - Graph* dGraph = NULL; + deepgalois::Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; dGraph = galois::graphs::constructSymmetricGraph(dummyVec); From c63e46440efe915474aedafcb626f9f9047d9530 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 16:41:05 -0500 Subject: [PATCH 211/660] fix mkl csrmm --- libdeepgalois/CMakeLists.txt | 3 +- libdeepgalois/include/deepgalois/context.h | 27 ++ .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 68 ++--- libdeepgalois/include/deepgalois/utils.h | 4 - libdeepgalois/src/context.cpp | 28 +- libdeepgalois/src/layers/aggregator.cpp | 4 + libdeepgalois/src/layers/graph_conv_layer.cpp | 12 +- libdeepgalois/src/math_functions.cpp | 281 ++++-------------- 9 files changed, 140 insertions(+), 289 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index be5853f987..2ede00abbc 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -105,10 +105,11 @@ set(sources ) endif() +set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) -target_link_libraries(dg_cpu ${BLAS_LIB}) +target_link_libraries(dg_cpu ${BLAS_LIB} ${BOOST_LIBRARIES}) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libgalois/include ${CMAKE_CURRENT_SOURCE_DIR}/include diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index e368319dff..ffbaecb0d3 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,6 +6,7 @@ #include #include #include "deepgalois/types.h" +#include #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -15,10 +16,13 @@ namespace deepgalois { +using boost::shared_ptr; + class Context { public: Context(); ~Context(); + static Context& Get(); size_t read_graph(std::string dataset_str, bool selfloop); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); @@ -66,6 +70,28 @@ class Context { inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif + // This random number generator facade hides boost and CUDA rng + // implementation from one another (for cross-platform compatibility). 
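The RNG class declared next is a facade: only the nested Generator type, defined in the .cpp, knows whether the underlying engine is a boost Mersenne Twister or a curand generator, so this header stays free of both dependencies. The Generator definition is not part of this hunk; a minimal CPU-side sketch of what it might look like (boost::mt19937 and the std::random_device seeding are assumptions, not taken from the patch):

    // Hypothetical definition in context.cpp; not part of this patch.
    class Context::RNG::Generator {
    public:
      Generator() : rng_(new boost::mt19937(std::random_device{}())) {}
      explicit Generator(unsigned seed) : rng_(new boost::mt19937(seed)) {}
      boost::mt19937* rng() { return rng_.get(); }
    private:
      boost::shared_ptr<boost::mt19937> rng_;
    };

    // Callers receive a type-erased pointer and cast it back to the engine they expect.
    void* Context::RNG::generator() { return static_cast<void*>(generator_->rng()); }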
+ class RNG { + public: + RNG(); + explicit RNG(unsigned int seed); + explicit RNG(const RNG&); + RNG& operator=(const RNG&); + void* generator(); + private: + class Generator; + shared_ptr generator_; + }; + + // Getters for boost rng, curand, and cublas handles + inline static RNG& rng_stream() { + if (!Get().random_generator_) { + Get().random_generator_.reset(new RNG()); + } + return *(Get().random_generator_); + } + protected: size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -82,6 +108,7 @@ class Context { float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factor; // normalization constant based on graph structure + shared_ptr random_generator_; #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index dc38642330..6cc40c266d 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -67,7 +67,7 @@ class graph_conv_layer : public layer { float_t* in_temp; float_t* in_temp1; float_t* trans_data; // y*x - unsigned* dropout_mask; // x*y + mask_t* dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 72b836da64..05a63ee9ca 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -12,7 +12,7 @@ #ifdef USE_MKL #include -#else // If use MKL, simply include the MKL header +#else extern "C" { #include } @@ -23,26 +23,33 @@ namespace deepgalois { namespace math { //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); + //! multiply n elements of vector by scalar void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); + //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); void axpy(size_t n, const float_t a, float_t *x, float_t *y); int argmax(const size_t n, const float_t* x); // the arguments of the maxima + //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); + //! clear n elements of a vector void clear_cpu(size_t n, float_t* in); + // dropout functions randomly remove weights -void dropout_cpu(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out); +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); + // dropout derivative: use existing dropouts in masks instead of generating them; -void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, - unsigned* mask, float_t* out_diff); +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* mask, float_t* out); + //! ReLU = keep if positive void relu_cpu(size_t n, const float_t* in, float_t* out); + //! 
ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); + void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); @@ -65,61 +72,28 @@ void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const //! copy vector from in -> out; first len elements void copy_cpu(size_t len, const float_t* in, float_t* out); + // single-precision dense matrix multiply void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); + // single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse void csrmm_cpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nonzero_idx, const float* B, const float beta, float* C); -} // deepgalois -} // math -//! clear entire vector -void clear(vec_t& in); -//! multiply vector by scalar -void mul_scalar(const float_t alpha, vec_t& Y); -//! add two same size vectors into out -void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add // dropout functions randomly remove weights -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, vec_t& out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, float_t* out); -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& mask, vec_t& out_diff); -//! ReLU = keep if positive -void relu(const vec_t& in, vec_t& out); -//! copy vector from in -> out -void copy1D1D(const vec_t& in, vec_t& out); -//! matrix multiply -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, - float_t* C); // matrix multiply -//! transposes a matrix (vector) -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); +void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); +void d_dropout(const float scale, const float_t* in, mask_t* mask, float_t* out); + //! 
transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); -void vsub(const vec_t& a, const vec_t& b, vec_t& out); -void vmul(const vec_t& a, const vec_t& b, vec_t& out); -void vdiv(const vec_t& a, const vec_t& b, vec_t& out); -void add_scalar(const float_t alpha, vec_t& Y); -void sub_scalar(const float_t alpha, vec_t& Y); -void div_scalar(const float_t alpha, vec_t& Y); -//void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); -void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector); -void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); -void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, - tensor_t& C); -void copy2D1D(const tensor_t& in, vec_t& out); -void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); -void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, - vec_t& C); -void transpose2D(const tensor_t& in, tensor_t& out); -void transpose2D1D(const tensor_t& in, vec_t& out); -int argmax(const size_t n, const vec_t& x); // the arguments of the maxima +void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in, float_t *out); + +} // math +} // deepgalois // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 60974b9f8a..c8bb1d4e41 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -96,10 +96,6 @@ uniform_rand(T min, T max) { return dst(random_generator::get_instance()()); } -inline bool bernoulli(float_t p) { - return uniform_rand(float_t(0), float_t(1)) > p; -} - // sequential prefix sum template inline std::vector prefix_sum(const std::vector &in) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9b5b858206..efbc525a32 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -4,9 +4,20 @@ #include "deepgalois/context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" +#include namespace deepgalois { +// Make sure each thread can have different values. 
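The boost::thread_specific_ptr declared next gives every thread that touches it its own lazily constructed Context, owned by that thread; the intent is to keep random state out of shared memory when Galois runs work in parallel. A usage sketch, reconstructed from the commented-out helper that appears further below in this series, and therefore illustrative only (rng_t = boost::mt19937 and the float probability type are assumptions):

    // Assumes the usual <boost/random/...> headers are available.
    typedef boost::mt19937 rng_t;

    // Returns the calling thread's engine; the first call on a thread builds its Context.
    inline rng_t* deepgalois_rng() {
      return static_cast<rng_t*>(deepgalois::Context::rng_stream().generator());
    }

    // Fills r with n Bernoulli(p) draws, e.g. a dropout keep-mask with p = 1 - dropout_rate.
    inline void rng_bernoulli(size_t n, float p, uint8_t* r) {
      boost::bernoulli_distribution<float> dist(p);
      boost::variate_generator<rng_t*, boost::bernoulli_distribution<float> > gen(deepgalois_rng(), dist);
      for (size_t i = 0; i < n; ++i) r[i] = gen();
    }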
+static boost::thread_specific_ptr thread_instance_; + +Context& Context::Get() { + if (!thread_instance_.get()) { + thread_instance_.reset(new Context()); + } + return *(thread_instance_.get()); +} + #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), @@ -129,15 +140,28 @@ void Context::add_selfloop(Graph &og, Graph &g) { void Context::norm_factor_counting(size_t g_size) { auto g = getGraphPointer(); auto subg = getSubgraphPointer(); - g->degree_counting(); if (use_subgraph) g = subg; + g->degree_counting(); +#ifdef USE_MKL + if (norm_factor == NULL) norm_factor = new float_t[g->sizeEdges()]; + galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { + float_t c_i = std::sqrt(float_t(g->get_degree(i))); + for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { + const auto j = g->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(g->get_degree(j))); + if (c_i == 0.0 || c_j == 0.0) norm_factor[e] = 0.0; + else norm_factor[e] = 1.0 / (c_i * c_j); + } + }, galois::loopname("NormCountingEdge")); +#else if (norm_factor == NULL) norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); + }, galois::loopname("NormCountingVertex")); +#endif } void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 7dc1752436..bd76b8b99b 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -37,9 +37,13 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + galois::StatTimer Tcsrmm("CSRMM-MKL"); + //galois::gPrint("csrmm mkl\n"); + Tcsrmm.start(); unsigned n = g.size(); math::clear_cpu(n*len, out); math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + Tcsrmm.stop(); } #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index dae3d14ce5..c5c73b1f36 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -13,7 +13,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); - assert(dropout_rate_ < 1.); + assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); scale_ = 1. / (1. 
- dropout_rate_); } @@ -61,8 +61,8 @@ void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, floa void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { float_t *a = new float_t[len]; float_t *b = new float_t[len]; - mvmul(n, len, &Q[0], self, a); - mvmul(n, len, &W[0], neighbors, b); + math::mvmul(n, len, &Q[0], self, a); + math::mvmul(n, len, &W[0], neighbors, b); math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } @@ -90,7 +90,7 @@ void graph_conv_layer::malloc_and_init() { // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); - if (dropout_) dropout_mask = new unsigned[x * y]; + if (dropout_) dropout_mask = new mask_t[x * y]; in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x @@ -108,7 +108,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W if (dropout_ && phase_ == net_phase::train) - math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); else math::copy_cpu(x*y, in_data, in_temp); if (y > z) { @@ -165,7 +165,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #endif if (level_ != 0 && dropout_) - math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index aa41ffc41f..4c0354cccc 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,7 +1,9 @@ #include "deepgalois/math_functions.hh" #include "galois/Timer.h" #include "galois/Galois.h" +#include #include +#include #include "deepgalois/utils.h" #ifdef USE_MKL @@ -18,7 +20,29 @@ extern "C" { exit(1); \ } while(0); +std::default_random_engine generator; +std::uniform_real_distribution distribution(0.0,1.0); +/* +typedef boost::mt19937 rng_t; +inline rng_t* deepgalois_rng() { + return static_cast(Context::rng_stream().generator()); +} + +void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(deepgalois_rng(), random_distribution); + for (size_t i = 0; i < n; ++i) + r[i] = variate_generator(); +} +*/ namespace deepgalois { + +inline uint8_t bernoulli(float_t p) { + //return uniform_rand(float_t(0), float_t(1)) > p ? 1 : 0; + return distribution(generator) > p ? 1 : 0; +} + namespace math { //! 
wrapper function to call cblas_sgemm @@ -41,8 +65,7 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #ifdef USE_MKL const char *matdescra = "GXXCX";//6 bytes const char transa = 'N'; - printf("Calling Intel MKL\n"); - exit(1); + //printf("Calling Intel MKL\n"); exit(1); mkl_scsrmm(&transa, &M , &N, &K, &alpha , matdescra, A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, B, &N, &beta , C, &N); @@ -168,55 +191,40 @@ void clear(vec_t& in) { } void clear_cpu(size_t n, float_t* in) { - for (size_t i = 0; i < n; i++) in[i] = 0; + //for (size_t i = 0; i < n; i++) in[i] = 0; + std::fill(in, in+n, 0); // memset(in, 0, n*sizeof(float_t)); } -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, vec_t& out) { - assert(masks.size() == out.size()); - // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + for (size_t i = 0; i < m; ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; } -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, float_t* out) { - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; -} - -void dropout_cpu(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; - out[i] = in[i] * masks[i] * scale; +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + for (size_t i = 0; i < n*m; ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { + out[i] = in[i] * (float_t)masks[i] * scale; }, galois::loopname("dropout")); } -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& masks, vec_t& out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, float_t* out) { + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; } -void d_dropout_cpu(size_t n, const float scale, const float_t* in, - unsigned* masks, float_t* out) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = in[i] * masks[i] * scale; +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* masks, float_t* out) { + galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { + out[i] = in[i] * (float_t)masks[i] * scale; }, galois::loopname("d_dropout")); } -void relu(const vec_t& in, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0); - } -} - void relu_cpu(size_t n, const float_t* in, float_t* out) { // TODO: vectorize galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { @@ -373,15 +381,6 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, sgemm_cpu(CblasNoTrans, CblasNoTrans, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } 
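Stepping back from the individual hunks: with MKL enabled, aggregation becomes a single sparse-times-dense product C = A_hat * X, where the graph's CSR structure supplies the sparsity pattern and the per-edge norm_factor values supply the nonzeros. Since norm_factor_counting (in the context.cpp hunk above) stores 1/sqrt(deg(i) * deg(j)) for edge (i, j), A_hat is the symmetric GCN normalization D^{-1/2} A D^{-1/2}; for example, an edge joining a degree-4 and a degree-1 vertex gets weight 1/sqrt(4 * 1) = 0.5. The vertex-based fallback keeps one 1/sqrt(deg(v)) per vertex and multiplies the two endpoint factors during the gather, which produces the same edge weights. The call shape used by update_all_csrmm in the aggregator hunk above is:

    // out = 1.0 * A_hat * in + 0.0 * out, with norm_factor as the nonzero values of A_hat
    math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor,
                    (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out);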
-// TODO make parallel -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j * y + i]; - } - } -} - // TODO make parallel void transpose(size_t x, size_t y, const float_t* in, float_t* out) { for (size_t i = 0; i < y; i++) { @@ -390,51 +389,6 @@ void transpose(size_t x, size_t y, const float_t* in, float_t* out) { } } } -} // deepgalois -} // math - - - -// vector subtract -void vsub(const vec_t& in_a, const vec_t& in_b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) - out[i] = in_a[i] - in_b[i]; -} - -// vector multiply -void vmul(const vec_t& in_a, const vec_t& in_b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) - out[i] = in_a[i] * in_b[i]; -} - -// vector divide -void vdiv(const vec_t& in_a, const vec_t& in_b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) { - assert(in_b[i] != 0); - out[i] = in_a[i] / in_b[i]; - } -} - -// vector add scalar -void add_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] += alpha; -} - -// vector subtract scalar -void sub_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] -= alpha; -} - - -// vector divide scalar -void div_scalar(const float_t alpha, vec_t& Y) { - assert(alpha != 0); - for (size_t i = 0; i < Y.size(); ++i) - Y[i] /= alpha; -} - // matrix-vector multiply void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { @@ -445,143 +399,14 @@ void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, } } -// vector-vector multiply -void vvmul(const vec_t& a, const vec_t& b, tensor_t& out) { - size_t m = a.size(); - size_t n = b.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out[i][j] += a[i] * b[j]; - } - } -} - -// matrix addition -void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, - tensor_t& C) { - for (size_t i = 0; i < x; ++i) - for (size_t j = 0; j < y; ++j) - C[i][j] = A[i][j] + B[i][j]; -} - -// TODO: vectorize -void copy2D1D(const tensor_t& in, vec_t& out) { - size_t x = in.size(); - size_t y = in[0].size(); - auto ptr = &out[0]; - for (size_t i = 0; i < x; i++) { - std::copy(in[i].begin(), in[i].end(), ptr); - ptr += y; - } -} - - - -void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(C.size() == dim_x); - assert(B.size() == dim_z); - assert(B[0].size() == dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k][j]; - } - } - } -} - - -void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, - vec_t& C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_z = A[0].size(); - assert(B.size() == dim_z * dim_y); - assert(C.size() == dim_x * dim_y); - vec_t A1D(dim_x * dim_z); - copy2D1D(A, A1D); - deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); -} - -void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = C.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(A.size() == dim_x); - assert(B.size() == dim_y * dim_z); - vec_t A1D(dim_x * dim_z); - vec_t C1D(dim_x * dim_y, 0); - auto ptr = &A1D[0]; - for (size_t i = 0; i < dim_x; i++) { - 
std::copy(A[i].begin(), A[i].end(), ptr); - ptr += dim_z; - } - deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); - for (size_t i = 0; i < dim_x; i++) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = C1D[i * dim_y + j]; - } - } -} - -void transpose2D(const tensor_t& in, tensor_t& out) { - size_t x = in.size(); - size_t y = in[0].size(); - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i][j] = in[j][i]; - } - } -} - -// TODO: vectorize -void transpose2D1D(const tensor_t& in, vec_t& out) { - size_t x = in.size(); - size_t y = in[0].size(); - assert(out.size() == x * y); - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j][i]; - } - } -} - - -int argmax(const size_t n, const vec_t& x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } +float reduce_mean(size_t n, const float_t* x) { + float_t sum = 0.; + for (size_t i = 0; i < n; i++) { + sum += (float_t)x[i]; } - return max_ind; -} - -void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { - vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff + return sum / (float_t)n; } -void d_vadd(vec_t& in_diff, vec_t& out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) - out_diff[i] = in_diff[i]; -} - -float reduce_mean(const vec_t& x) { - size_t n = x.size(); - assert(n > 0); - float sum = (float)x[0]; - for (size_t i = 1; i < n; i++) { - sum += (float)x[i]; - } - return sum / (float)n; -} +} // end namespace math +} // end namespace deepgalois From 59e506f04fa2ad91e05c17dc81cb281ca0006d01 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 20:20:03 -0500 Subject: [PATCH 212/660] clean math --- .../include/deepgalois/math_functions.hh | 58 +++---- libdeepgalois/src/context.cpp | 5 +- libdeepgalois/src/layers/aggregator.cpp | 4 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +- libdeepgalois/src/lgraph.cpp | 28 ++-- libdeepgalois/src/math_functions.cpp | 143 +++++------------- libdeepgalois/src/sampler.cpp | 2 + 7 files changed, 94 insertions(+), 153 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 05a63ee9ca..a39e463ecc 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -21,15 +21,37 @@ extern "C" { namespace deepgalois { namespace math { + +// single-precision dense matrix multiply +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); + +// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nonzero_idx, + const float* B, const float beta, float* C); + +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, + const float* A, const float* x, const float beta, float* y); + //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); //! 
multiply n elements of vector by scalar -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +void scal(size_t n, const float_t alpha, float_t* x); +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y); +void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); //! do dot product of 2 vectors -float_t dot(const vec_t& x, const vec_t& y); +float_t dot(size_t n, const float_t* x, const float_t* y); + +// SAXPY stands for โ€œSingle-precision A*X Plus Y" void axpy(size_t n, const float_t a, float_t *x, float_t *y); + +// Returns the index of the maximum value int argmax(const size_t n, const float_t* x); // the arguments of the maxima //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 @@ -38,60 +60,42 @@ float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector void clear_cpu(size_t n, float_t* in); +//! copy vector from in -> out; first len elements +void copy_cpu(size_t len, const float_t* in, float_t* out); + // dropout functions randomly remove weights void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); // dropout derivative: use existing dropouts in masks instead of generating them; void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* mask, float_t* out); -//! ReLU = keep if positive +//! ReLU = keep if positive; and ReLU derivative: 1 if data > 0, 0 otherwise void relu_cpu(size_t n, const float_t* in, float_t* out); - -//! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); +// Leaky ReLU void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); // Loss function for single-class label (one-hot) data: softmax -void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); -float_t cross_entropy(const vec_t& y, const vec_t& p); +// Cross entropy float_t cross_entropy(size_t n, const float_t* y, const float_t* p); -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // Loss function for multi-class label (one-hot) data: sigmoid -void sigmoid(const vec_t& input, vec_t& output); void sigmoid(size_t n, const float_t* input, float_t* output); -void d_sigmoid(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); -//! 
copy vector from in -> out; first len elements -void copy_cpu(size_t len, const float_t* in, float_t* out); - -// single-precision dense matrix multiply -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C); - -// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse -void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nonzero_idx, - const float* B, const float beta, float* C); - // dropout functions randomly remove weights void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); void d_dropout(const float scale, const float_t* in, mask_t* mask, float_t* out); //! transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); -void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in, float_t *out); - + } // math } // deepgalois diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index efbc525a32..37c9a33e04 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -142,8 +142,9 @@ void Context::norm_factor_counting(size_t g_size) { auto subg = getSubgraphPointer(); if (use_subgraph) g = subg; g->degree_counting(); + if (norm_factor != NULL) free(norm_factor); #ifdef USE_MKL - if (norm_factor == NULL) norm_factor = new float_t[g->sizeEdges()]; + norm_factor = new float_t[g->sizeEdges()]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { float_t c_i = std::sqrt(float_t(g->get_degree(i))); for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { @@ -154,7 +155,7 @@ void Context::norm_factor_counting(size_t g_size) { } }, galois::loopname("NormCountingEdge")); #else - if (norm_factor == NULL) norm_factor = new float_t[g_size]; + norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index bd76b8b99b..e3d6f12f78 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -22,9 +22,9 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou if (norm) { // normalize b as well b = a * norm_factor[dst]; - vec_t neighbor(len); + float_t* neighbor = new float_t[len]; // scale the neighbor's data using the normalization factor - math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + math::scale(len, b, &in[dst * len], neighbor); // use scaled data to update; out[src] += in[dst] math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); } else { diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c5c73b1f36..31622e0699 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -61,8 +61,8 @@ void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, floa void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { float_t *a = new float_t[len]; float_t *b = new float_t[len]; - math::mvmul(n, len, &Q[0], self, a); - math::mvmul(n, len, &W[0], neighbors, b); + math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 
0.0, a); + math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } @@ -72,8 +72,7 @@ void graph_conv_layer::malloc_and_init() { size_t z = output_dims[1]; #ifdef GALOIS_USE_DIST // setup gluon - layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, - y * z); + layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); layer::syncSub = new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index ba4432aca9..6c36eb464b 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -1,5 +1,6 @@ #include "deepgalois/lgraph.h" #include "deepgalois/utils.h" +#include "galois/Galois.h" #include #include #include @@ -22,24 +23,33 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { } void LearningGraph::allocateFrom(index_t nv, index_t ne) { + num_vertices_ = nv; + num_edges_ = ne; + printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + rowptr_[0] = 0; } void LearningGraph::constructNodes() { } void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { + rowptr_[vid+1] = row_end; } void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { + assert(dst < num_vertices_); + assert(eid < num_edges_); + colidx_[eid] = dst; } void LearningGraph::degree_counting() { -/* - degrees = new uint32_t[num_vertices_]; - galois::do_all(galois::iterate(begin(), end()), [&] (auto v) { - degrees[v] = std::distance(this->edge_begin(v), this->edge_end(v)); + if (degrees_ != NULL) return; + degrees_ = new index_t[num_vertices_]; + galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), [&] (auto v) { + degrees_[v] = rowptr_[v+1] - rowptr_[v]; }, galois::loopname("DegreeCounting")); -*/ } void LearningGraph::readGraph(std::string path, std::string dataset) { @@ -76,15 +86,15 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { assert(version == 1); uint64_t sizeEdgeTy = le64toh(*fptr++); uint64_t nv = le64toh(*fptr++); - uint64_t numEdges = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); uint64_t *outIdx = fptr; fptr += nv; uint32_t *fptr32 = (uint32_t*)fptr; uint32_t *outs = fptr32; - fptr32 += numEdges; - if (numEdges % 2) fptr32 += 1; + fptr32 += ne; + if (ne % 2) fptr32 += 1; num_vertices_ = nv; - num_edges_ = numEdges; + num_edges_ = ne; if (sizeEdgeTy != 0) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 4c0354cccc..08ceaab76c 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -74,28 +74,25 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #endif } +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, + const float* A, const float* x, const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +/* const size_t vec_len = 8; // for 32-bit floating point in AVX2 // vector add -#if defined(__AVX__) || defined(__AVX2__) void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { +#ifdef __AVX2__ const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) 
_mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - size_t n = out.size(); - vadd_cpu(n, &a[0], &b[0], &out[0]); -} #else -void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; -} -void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; -} #endif +} #if defined(__AVX__) || defined(__AVX2__) void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { @@ -107,7 +104,6 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) } // SAXPY stands for โ€œSingle-precision A*X Plus Y" -/* void axpy(size_t n, const float_t a, float_t *x, float_t *y) { const size_t alignedN = n - n % vec_len; const __m256 alpha = _mm256_set1_ps(a); @@ -128,27 +124,32 @@ float_t l2_norm(size_t n, const float_t* in) { __m256 sum = _mm256_hadd_ps(vsum, vsum); return (((float_t*)&sum)[0] + ((float_t*)&sum)[2]) / 2.0; } -*/ #else // vector multiply scalar -void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; -} - void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } -//void axpy(size_t n, const float_t a, float_t *x, float_t *y) { -// for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; -//} - -//float_t l2_norm(size_t n, const float_t* a) { -// float_t sum = 0.0; -// for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; -// return sum / 2.0; -//} +float_t l2_norm(size_t n, const float_t* a) { + float_t sum = 0.0; + for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; + return sum / 2.0; +} #endif +*/ + +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { + vsAdd(n, a, b, y); +} + +void scal(size_t n, const float_t alpha, float_t* x) { + cblas_sscal(n, alpha, x, 1); +} + +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} void axpy(size_t n, const float_t a, float_t *x, float_t *y) { cblas_saxpy(n, a, x, 1, y, 1); @@ -166,28 +167,14 @@ int argmax(const size_t n, const float_t* x) { return max_ind; } +// l2 normalization float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } // dot product -float_t dot(const vec_t& x, const vec_t& y) { - float_t sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; -} - float_t dot(size_t n, const float_t* x, const float_t* y) { - float_t sum = 0; - for (size_t i = 0; i < n; ++i) - sum += x[i] * y[i]; - return sum; -} - -void clear(vec_t& in) { - for (size_t i = 0; i < in.size(); i++) - in[i] = 0; + return cblas_sdot(n, x, 1, y, 1); } void clear_cpu(size_t n, float_t* in) { @@ -255,17 +242,6 @@ void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, }, galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); } -void softmax(const vec_t& input, vec_t& output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - void softmax(size_t n, const float_t* input, float_t* output) { const float_t max = 
*std::max_element(input, input + n); float_t denominator(0); @@ -277,20 +253,6 @@ void softmax(size_t n, const float_t* input, float_t* output) { output[i] /= denominator; } -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - // float_t delta_ij = i == j? 1 : 0; - // df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -} - void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { vec_t df(n, 0); @@ -305,20 +267,6 @@ void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability -float_t cross_entropy(const vec_t& y, const vec_t& p) { - auto n = y.size(); - assert(n > 0); - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) - continue; - if (p[i] == float_t(0)) - loss -= y[i] * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]); - } - return loss; -} - float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { float_t loss = 0.0; for (size_t i = 0; i < n; i++) { @@ -332,13 +280,6 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { return loss; } -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { - auto n = y.size(); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } -} - void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { for (size_t i = 0; i < n; i++) { d[i] = -y[i] / (p[i] + float_t(1e-10)); @@ -350,11 +291,6 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } // Sigmoid -void sigmoid(const vec_t& in, vec_t &out) { - for (size_t i = 0; i < in.size(); ++i) - out[i] = sigmoid_func(in[i]); -} - void sigmoid(size_t n, const float_t* in, float_t* out) { for (size_t i = 0; i < n; i++) { out[i] = 1. / (1. 
+ expf(-in[i])); @@ -367,12 +303,10 @@ void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const } } -void copy1D1D(const vec_t& in, vec_t& out) { - std::copy(in.begin(), in.end(), &out[0]); -} - -void copy_cpu(size_t len, const float_t* in, float_t* out) { - std::copy(in, in + len, out); +void copy_cpu(size_t n, const float_t* in, float_t* out) { + //std::copy(in, in + n, out); + //memcpy(out, in, sizeof(float_t) * n); + cblas_scopy(n, in, 1, out, 1); } // num rows in A, C; num columns in B, C; num columns in A, rows in B @@ -390,15 +324,6 @@ void transpose(size_t x, size_t y, const float_t* in, float_t* out) { } } -// matrix-vector multiply -void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out_vector[i] += matrix[i * n + j] * in_vector[j]; - } - } -} - float reduce_mean(size_t n, const float_t* x) { float_t sum = 0.; for (size_t i = 0; i < n; i++) { diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index a1b1e4feba..f1e4238a84 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -54,6 +54,7 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su } } }, galois::loopname("gen_subgraph")); + sub.degree_counting(); #endif } @@ -143,6 +144,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { j ++; } }, galois::loopname("construct_graph")); + sub.degree_counting(); #endif } From 14461e39772ebbabea3920fae396e31a7facba50 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 15:56:14 -0500 Subject: [PATCH 213/660] udapte cmake --- CMakeLists.txt | 4 +++- libdeepgalois/CMakeLists.txt | 4 ++++ libdeepgalois/src/math_functions.cpp | 1 + lonestargnn/CMakeLists.txt | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc0250a3f4..a56a1702e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,9 @@ if(USE_VTUNE) endif() if(USE_MKL_BLAS) - SET(MKL_ROOT /opt/apps/sysnet/intel/17.0/mkl) + SET(INTEL_ROOT /opt/apps/sysnet/intel/17.0) + SET(MKL_ROOT ${INTEL_ROOT}/mkl) + SET(INTEL_LIBS_DIR ${INTEL_ROOT}/lib/intel64_lin) find_package(MKL) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") if (MKL_FOUND) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 2ede00abbc..3c41d945cb 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,12 +1,16 @@ cmake_minimum_required(VERSION 2.8) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) set(BLAS_LIB "-lopenblas -lpthread") if(USE_MKL_BLAS) + link_directories(${INTEL_LIBS_DIR}) + message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") SET(BLAS_INC_DIR ${MKL_ROOT}/include) SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") endif() diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 08ceaab76c..e254839bed 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -63,6 +63,7 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float* C) 
{ #ifdef USE_MKL + mkl_set_num_threads(56); const char *matdescra = "GXXCX";//6 bytes const char transa = 'N'; //printf("Calling Intel MKL\n"); exit(1); diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 24c9c6a726..6db3877a6f 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -10,6 +10,7 @@ if(USE_MKL_BLAS) SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") endif() link_directories(${BLAS_LIB_DIR}) +link_directories(${INTEL_LIBS_DIR}) if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") From 6efe3bbb663fae737d4c19592ae7fecf9e751d9e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 16:10:05 -0500 Subject: [PATCH 214/660] fix compile --- libdeepgalois/src/math_functions.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index e254839bed..968f477f63 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -81,8 +81,8 @@ void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float a cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } +const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 /* -const size_t vec_len = 8; // for 32-bit floating point in AVX2 // vector add void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { #ifdef __AVX2__ @@ -140,7 +140,18 @@ float_t l2_norm(size_t n, const float_t* a) { */ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { +#ifdef USE_MKL vsAdd(n, a, b, y); +#else +#ifdef __AVX2__ + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) y[i] = a[i] + b[i]; +#else + for (size_t i = 0; i < n; ++i) y[i] = a[i] + b[i]; +#endif +#endif } void scal(size_t n, const float_t alpha, float_t* x) { From e1e037832b03a94048938ab9fdbfad6f911b0c4a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 19:47:20 -0500 Subject: [PATCH 215/660] fix gpu --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/context.h | 14 ++-- libdeepgalois/include/deepgalois/lgraph.h | 6 +- .../include/deepgalois/math_functions.hh | 8 +- libdeepgalois/include/deepgalois/net.h | 16 ++-- libdeepgalois/src/context.cpp | 6 +- libdeepgalois/src/layers/graph_conv_layer.cu | 2 +- libdeepgalois/src/lgraph.cpp | 1 + libdeepgalois/src/lgraph.cu | 48 ++++++----- libdeepgalois/src/math_functions.cpp | 8 +- libdeepgalois/src/math_functions.cu | 39 +++++---- libdeepgalois/src/net.cpp | 82 +++++++++++-------- lonestargnn/gcn/gcn.cpp | 12 +-- 13 files changed, 134 insertions(+), 110 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3c41d945cb..7afa6c9169 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -109,7 +109,7 @@ set(sources ) endif() -set(BOOST_LIBRARIES "-lboost_system -lboost_thread") +#set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index ffbaecb0d3..ff324ef60f 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,7 
+6,7 @@ #include #include #include "deepgalois/types.h" -#include +//#include #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -16,13 +16,10 @@ namespace deepgalois { -using boost::shared_ptr; - class Context { public: Context(); ~Context(); - static Context& Get(); size_t read_graph(std::string dataset_str, bool selfloop); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); @@ -69,7 +66,7 @@ class Context { inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif - +/* // This random number generator facade hides boost and CUDA rng // implementation from one another (for cross-platform compatibility). class RNG { @@ -81,9 +78,10 @@ class Context { void* generator(); private: class Generator; - shared_ptr generator_; + boost::shared_ptr generator_; }; + static Context& Get(); // Getters for boost rng, curand, and cublas handles inline static RNG& rng_stream() { if (!Get().random_generator_) { @@ -91,7 +89,7 @@ class Context { } return *(Get().random_generator_); } - +*/ protected: size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -108,7 +106,7 @@ class Context { float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factor; // normalization constant based on graph structure - shared_ptr random_generator_; + //boost::shared_ptr random_generator_; #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index bf3ace2470..733c6620d8 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -1,7 +1,7 @@ #pragma once #include "deepgalois/types.h" #include -#include +//#include namespace deepgalois { @@ -17,8 +17,8 @@ class LearningGraph { edata_t *edge_data_; public: - //typedef index_t* iterator; - using iterator = boost::counting_iterator; + typedef size_t iterator; + //using iterator = boost::counting_iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index a39e463ecc..53baa2ff0f 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -112,10 +112,10 @@ void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, float_t* out); // Leaky ReLU void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, const float_t* data, float_t* out_diff); // Leaky ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out); // dropout -void d_dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, const unsigned* masks, float_t* out); // dropout derivative +void dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out); // dropout +void d_dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, const mask_t* masks, float_t* out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, 
const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 820367bef5..ba34687e22 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -5,18 +5,19 @@ #define _MODEL_H_ #include -#include "galois/Timer.h" #include "deepgalois/types.h" -#include "deepgalois/gtypes.h" #include "deepgalois/layers/l2_norm_layer.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" +#ifdef CPU_ONLY #include "deepgalois/sampler.h" +#endif #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else +#include "deepgalois/gtypes.h" #include "deepgalois/DistContext.h" #endif @@ -40,8 +41,10 @@ class Net { void init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sample_size = 0, unsigned subg_sample = 0, - Graph* dGraph = NULL); + unsigned neigh_sample_size = 0, unsigned subg_sample = 0); +#ifdef GALOIS_USE_DIST + void dist_init(Graph* dGraph); +#endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -55,7 +58,7 @@ class Net { void train(optimizer* opt, bool need_validate); // training double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference - void read_test_masks(std::string dataset, Graph* dGraph); + void read_test_masks(std::string dataset); acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation void bprop(); // back propogation void normalize(); // Scale gradient to counterbalance accumulation @@ -106,14 +109,15 @@ class Net { mask_t* subgraph_masks; // masks for subgraph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network - Sampler *sampler; #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else deepgalois::DistContext* context; + Graph* dGraph; #endif #ifdef CPU_ONLY + Sampler *sampler; // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 37c9a33e04..ffc2069024 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -4,10 +4,10 @@ #include "deepgalois/context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" -#include +//#include namespace deepgalois { - +/* // Make sure each thread can have different values. 
static boost::thread_specific_ptr thread_instance_; @@ -17,7 +17,7 @@ Context& Context::Get() { } return *(thread_instance_.get()); } - +*/ #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 41f6e30a0f..7edb4ab10c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -8,7 +8,7 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; - if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(unsigned))); + if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); float_malloc_device(x*y, in_temp); init_const_gpu(x*y, 0.0, in_temp); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 6c36eb464b..891973e612 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -11,6 +11,7 @@ #include namespace deepgalois { + void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { const unsigned nsteps = 10; unsigned ineachstep = (maxii / nsteps); diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 14b7239358..0a925bbbdb 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -1,41 +1,47 @@ +#include "deepgalois/lgraph.h" +#include "deepgalois/cutils.h" +#include + +namespace deepgalois { void LearningGraph::dealloc() { assert(is_device); CUDA_CHECK(cudaFree(colidx_)); CUDA_CHECK(cudaFree(rowptr_)); CUDA_CHECK(cudaFree(degrees_)); - if (edge_data != NULL) CUDA_CHECK(cudaFree(edge_data)); - if (vertex_data != NULL) CUDA_CHECK(cudaFree(vertex_data)); + if (edge_data_ != NULL) CUDA_CHECK(cudaFree(edge_data_)); + if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(vertex_data_)); } -void LearningGraph::allocOnDevice(bool no_edge_data_) { - if (colidx_ != NULL) return true; - CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_type))); - CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_type))); - CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_type))); - //if (!no_edge_data_) CUDA_CHECK(cudaMalloc((void **) &edge_data_, num_edges_ * sizeof(edge_data__t))); - //CUDA_CHECK(cudaMalloc((void **) &vertex_data_, num_vertices_ * sizeof(vdata_t))); +void LearningGraph::allocOnDevice(bool no_edge_data__) { + if (colidx_ != NULL) return; + CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_t))); + //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, num_edges_ * sizeof(edge_data___t))); + //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); is_device = true; } void LearningGraph::copy_to_gpu(LearningGraph ©graph) { copygraph.init(num_vertices_, num_edges_); copygraph.allocOnDevice(edge_data_ == NULL); - CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_type), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyHostToDevice)); - //if (edge_data_ != NULL) 
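
The copy_to_gpu fix above follows the usual error-checked mirror-to-device pattern for a CSR graph. A self-contained sketch of that pattern is below; the CUDA_CHECK_SKETCH macro and function name are local to this example and are not the library's cutils.h versions.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <cstdint>

#define CUDA_CHECK_SKETCH(call)                                   \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
              cudaGetErrorString(err), __FILE__, __LINE__);       \
      exit(1);                                                    \
    }                                                             \
  } while (0)

// Mirror host CSR arrays (rowptr has nv+1 entries, colidx has ne entries) on the device.
void copy_csr_to_device_sketch(size_t nv, size_t ne, const uint32_t* rowptr,
                               const uint32_t* colidx, uint32_t** d_rowptr,
                               uint32_t** d_colidx) {
  CUDA_CHECK_SKETCH(cudaMalloc((void**)d_rowptr, (nv + 1) * sizeof(uint32_t)));
  CUDA_CHECK_SKETCH(cudaMalloc((void**)d_colidx, ne * sizeof(uint32_t)));
  CUDA_CHECK_SKETCH(cudaMemcpy(*d_rowptr, rowptr, (nv + 1) * sizeof(uint32_t),
                               cudaMemcpyHostToDevice));
  CUDA_CHECK_SKETCH(cudaMemcpy(*d_colidx, colidx, ne * sizeof(uint32_t),
                               cudaMemcpyHostToDevice));
}
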
CUDA_CHECK(cudaMemcpy(copygraph.edge_data_, edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_, vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); + //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); } void LearningGraph::copy_to_cpu(LearningGraph ©graph) { assert(is_device); - assert(copygraph.size() = num_vertices_); - assert(copygraph.sizeEdges() = num_edges_); - CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_type), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyDeviceToHost)); - //if (edge_data_ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data_ptr(), edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_ptr(), vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); + assert(copygraph.size() == num_vertices_); + assert(copygraph.sizeEdges() == num_edges_); + CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } +} diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 968f477f63..45fccaea04 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,10 +1,9 @@ -#include "deepgalois/math_functions.hh" -#include "galois/Timer.h" -#include "galois/Galois.h" #include #include -#include +#include "galois/Timer.h" +#include "galois/Galois.h" #include "deepgalois/utils.h" +#include "deepgalois/math_functions.hh" #ifdef USE_MKL #include @@ -23,6 +22,7 @@ extern "C" { std::default_random_engine generator; std::uniform_real_distribution distribution(0.0,1.0); /* +#include typedef boost::mt19937 rng_t; inline rng_t* deepgalois_rng() { return static_cast(Context::rng_stream().generator()); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index c1746d9075..62a7af3849 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -28,8 +28,8 @@ bool isnan_gpu(int n, const float_t *array) { return h_result; } -void gpu_rng_uniform(const int n, unsigned* r) { - CURAND_CHECK(curandGenerate(deepgalois::Context::curand_generator(), r, n)); 
+void gpu_rng_uniform(const int n, float_t* r) { + CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); } void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r) { @@ -78,30 +78,33 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { } } -__global__ void dropout_kernel(const int n, const float scale, - const float threshold, const float_t* in, - unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } +__global__ void dropout_kernel(int n, float scale, float threshold, + float_t *rands, const float_t* in, + mask_t* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + masks[i] = rands[i] > threshold ? 1 : 0; + out[i] = in[i] * masks[i] * scale; + } } -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - gpu_rng_uniform(n, masks); - //std::cout << "[debug]: dropout_gpu\n"; +void dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + float_t *rands; + float_malloc_device(n, rands); + gpu_rng_uniform(n, rands); dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, out); + n, scale, dropout_rate, rands, in, masks, out); CudaTest("solving dropout kernel failed"); - //std::cout << "[debug]: dropout_gpu done\n"; + float_free_device(rands); } -__global__ void d_dropout_kernel(const int n, const float scale, - const float threshold, const float_t* in, - const unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } +__global__ void d_dropout_kernel(int n, float scale, float threshold, + const float_t* in, const mask_t* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } } -void d_dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, const unsigned* masks, float_t* out) { +void d_dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, const mask_t* masks, float_t* out) { d_dropout_kernel<<>>( n, scale, dropout_rate, in, masks, out); CudaTest("solving d_dropout kernel failed"); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7da9fcbb18..cc7ba738cc 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,6 +2,7 @@ * Based on the net.hpp file from Caffe deep learning framework. */ +#include "galois/Timer.h" #include "deepgalois/net.h" #include "deepgalois/utils.h" #include "deepgalois/math_functions.hh" @@ -11,7 +12,7 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, Graph* dGraph) { + unsigned neigh_sz, unsigned subg_sz) { assert(num_conv > 0); num_conv_layers = num_conv; num_epochs = epochs; @@ -32,21 +33,14 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, ", weight_decay ", weight_decay, "\n"); #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - context->set_label_class(is_single_class); - context->set_use_subgraph(subgraph_sample_size > 0); num_samples = context->read_graph(dataset_str, selfloop); - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); -#else - context = new deepgalois::DistContext(); - num_samples = dGraph->size(); - context->saveGraph(dGraph); - // TODO self loop setup? 
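
A host-side sketch of the semantics the rewritten dropout_gpu / d_dropout_gpu kernels above implement: each unit is kept when its uniform draw exceeds the dropout rate (so keep probability is 1 - rate), kept activations are multiplied by the scale factor (typically 1/(1 - rate) for inverted dropout, an assumption here), and the backward pass reuses the saved mask. Names below are illustrative only.

#include <cstdint>
#include <cstddef>
#include <random>

// Forward: mask[i] = 1 with probability (1 - rate); out = in * mask * scale.
void dropout_forward_sketch(size_t n, float rate, float scale, const float* in,
                            uint8_t* mask, float* out, std::mt19937& gen) {
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  for (size_t i = 0; i < n; ++i) {
    mask[i] = uniform(gen) > rate ? 1 : 0;   // keep with probability 1 - rate
    out[i]  = in[i] * mask[i] * scale;
  }
}

// Backward: gradients flow only through kept units, with the same scaling.
void dropout_backward_sketch(size_t n, float scale, const float* grad_out,
                             const uint8_t* mask, float* grad_in) {
  for (size_t i = 0; i < n; ++i)
    grad_in[i] = grad_out[i] * mask[i] * scale;
}
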
- context->initializeSyncSubstrate(); + context->set_label_class(is_single_class); #endif // read graph, get num nodes num_classes = context->read_labels(dataset_str); +#ifndef GALOIS_USE_DIST //std::cout << "Reading label masks ... "; train_masks = new mask_t[num_samples]; val_masks = new mask_t[num_samples]; @@ -59,31 +53,13 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; // TODO do all can be used below -#ifndef GALOIS_USE_DIST for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; -#else - // find local ID from global ID, set if it exists - for (size_t i = train_begin; i < train_end; i++) { - if (dGraph->isLocal(i)) { - train_masks[dGraph->getLID(i)] = 1; - } - } - for (size_t i = val_begin; i < val_end; i++) { - if (dGraph->isLocal(i)) { - val_masks[dGraph->getLID(i)] = 1; - } - } -#endif } else { -#ifndef GALOIS_USE_DIST train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); -#else - train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); -#endif } +#endif if (subgraph_sample_size > train_count) { galois::gPrint("FATAL: subgraph size can not be larger than the size of training set\n"); @@ -108,13 +84,53 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); -#ifndef CPU_ONLY +#ifdef CPU_ONLY + context->set_use_subgraph(subgraph_sample_size > 0); + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); +#else copy_masks_device(num_samples, train_masks, d_train_masks); copy_masks_device(num_samples, val_masks, d_val_masks); context->copy_data_to_device(); // copy labels and input features to the device #endif } +#ifdef GALOIS_USE_DIST +void Net::dist_init(Graph* graph) { + dGraph = graph; + context = new deepgalois::DistContext(); + num_samples = dGraph->size(); + context->saveGraph(dGraph); + // TODO self loop setup? + context->initializeSyncSubstrate(); + + //std::cout << "Reading label masks ... 
"; + train_masks = new mask_t[num_samples]; + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks+num_samples, 0); + std::fill(val_masks, val_masks+num_samples, 0); + + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, + train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + // find local ID from global ID, set if it exists + for (size_t i = train_begin; i < train_end; i++) { + if (dGraph->isLocal(i)) { + train_masks[dGraph->getLID(i)] = 1; + } + } + for (size_t i = val_begin; i < val_end; i++) { + if (dGraph->isLocal(i)) { + val_masks[dGraph->getLID(i)] = 1; + } + } + } else { + train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); + } +} +#endif + void Net::train(optimizer* opt, bool need_validate) { std::string header = ""; std::string seperator = " "; @@ -396,7 +412,7 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, layers[layer_id]->set_graph_ptr(context->getGraphPointer()); } -void Net::read_test_masks(std::string dataset, Graph* dGraph) { +void Net::read_test_masks(std::string dataset) { test_masks = new mask_t[num_samples]; if (dataset == "reddit") { test_begin = 177262; @@ -430,7 +446,7 @@ void Net::read_test_masks(std::string dataset, Graph* dGraph) { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { #ifndef GALOIS_USE_DIST AccumF accuracy_all; #else @@ -479,7 +495,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks return accuracy_all.reduce() / (acc_t)count; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { auto preds = layers[num_conv_layers]->next()->get_data(); auto ground_truth = context->get_labels_ptr(); return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index fa492172a5..62a8067294 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,17 +18,17 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train - deepgalois::Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; - dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + network.dist_init(dGraph); #endif // read network, features, ground truth, initialize metadata network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, dGraph); + neighbor_sample_sz, subgraph_sample_sz); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); @@ -47,11 +47,7 @@ int main(int argc, char** argv) { if (do_test) { // test using test samples galois::gPrint("\n"); -#ifndef GALOIS_USE_DIST - network.read_test_masks(dataset, NULL); -#else - 
network.read_test_masks(dataset, dGraph); -#endif + network.read_test_masks(dataset); galois::StatTimer Ttest("Test"); Ttest.start(); acc_t test_loss = 0.0, test_acc = 0.0; From c487a8d4ee76f69fc35a59c6cdbd16b89e4bac81 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 19:55:16 -0500 Subject: [PATCH 216/660] fix cpu --- libdeepgalois/include/deepgalois/net.h | 4 ++-- libdeepgalois/src/net.cpp | 5 ++--- libdeepgalois/src/net.cu | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index ba34687e22..5c32292430 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -119,8 +119,8 @@ class Net { #ifdef CPU_ONLY Sampler *sampler; // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); #else acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index cc7ba738cc..86bd0f6340 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -270,11 +270,10 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { #endif loss = fprop(begin, end, count, masks); - auto g = context->getGraphPointer(); if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, g); + acc = masked_accuracy(begin, end, count, masks); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, g); + acc = masked_multi_class_accuracy(begin, end, count, masks); } t_eval.Stop(); return t_eval.Millisecs(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 3077566512..6ead99d31a 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -144,14 +144,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, CSRGraph *g) { + mask_t* masks) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers - 1]->next()->get_data(), context->get_labels_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, CSRGraph* g) { + mask_t* masks) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers]->next()->get_data(), context->get_labels_ptr()); From 15209f42111b76068278ad188596bc1ef94a4919 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 19:58:52 -0500 Subject: [PATCH 217/660] fix gpu --- libdeepgalois/include/deepgalois/net.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 5c32292430..fe5eaa8aac 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -122,8 +122,8 @@ class Net { acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, 
mask_t* masks); #else - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); #endif }; From 2491650e212dab17fe0b4e5c05ad27a4e7f043d2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 11:51:08 -0500 Subject: [PATCH 218/660] fix gpu --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/math_functions.hh | 2 +- libdeepgalois/src/math_functions.cu | 4 ++-- lonestargnn/CMakeLists.txt | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7afa6c9169..69a6e7fa40 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -46,6 +46,7 @@ else() #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) link_directories(${CUDA_HOME}/lib64) link_directories(${CMAKE_SOURCE_DIR}/libgpu) + message(STATUS "CUDA_LIB_DIR: ${CUDA_HOME}/lib64") set(CUDA_SOURCES src/layers/graph_conv_layer.cu diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 53baa2ff0f..fc9e798633 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -142,7 +142,7 @@ void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int n, const float alpha, float* X); void add_scalar_gpu(const int n, const float_t alpha, float_t* Y); -void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 62a7af3849..1f9c020676 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -28,11 +28,11 @@ bool isnan_gpu(int n, const float_t *array) { return h_result; } -void gpu_rng_uniform(const int n, float_t* r) { +void gpu_rng_uniform(size_t n, float_t* r) { CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); } -void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r) { +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 6db3877a6f..0c313d742c 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -1,8 +1,10 @@ include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) +link_directories(${CUDA_HOME}/lib64) if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + link_directories(${INTEL_LIBS_DIR}) endif() SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) @@ -10,7 +12,6 @@ if(USE_MKL_BLAS) SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") endif() link_directories(${BLAS_LIB_DIR}) 
-link_directories(${INTEL_LIBS_DIR}) if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") From 9ac1e7be7e0886818518f5715de9887ae59c8018 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 12:25:03 -0500 Subject: [PATCH 219/660] fix agg --- libdeepgalois/src/layers/aggregator.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index e3d6f12f78..0dec25c019 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,13 +4,13 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - // zero out the output data #ifndef GALOIS_USE_DIST galois::do_all(galois::iterate(size_t(0), g.size()),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif + // zero out the output data math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -22,9 +22,10 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou if (norm) { // normalize b as well b = a * norm_factor[dst]; - float_t* neighbor = new float_t[len]; + //float_t* neighbor = new float_t[len]; // this is super slow + vec_t neighbor(len); // scale the neighbor's data using the normalization factor - math::scale(len, b, &in[dst * len], neighbor); + math::scale(len, b, &in[dst * len], &neighbor[0]); // use scaled data to update; out[src] += in[dst] math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); } else { From 24292beef51cebd4343e95cfa801aed0261ae4ea Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 14:51:52 -0500 Subject: [PATCH 220/660] update dropout --- libdeepgalois/src/math_functions.cpp | 63 ++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 45fccaea04..ec43be8656 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,4 +1,7 @@ #include +#include +#include +#include #include #include "galois/Timer.h" #include "galois/Galois.h" @@ -19,8 +22,6 @@ extern "C" { exit(1); \ } while(0); -std::default_random_engine generator; -std::uniform_real_distribution distribution(0.0,1.0); /* #include typedef boost::mt19937 rng_t; @@ -36,15 +37,18 @@ void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { r[i] = variate_generator(); } */ + +std::default_random_engine generator; +std::uniform_real_distribution distribution(0.0,1.0); + namespace deepgalois { +namespace math { + inline uint8_t bernoulli(float_t p) { - //return uniform_rand(float_t(0), float_t(1)) > p ? 1 : 0; return distribution(generator) > p ? 1 : 0; } -namespace math { - //! 
wrapper function to call cblas_sgemm void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, @@ -81,6 +85,26 @@ void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float a cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } +inline void rng_uniform_cpu(size_t n, float_t* r) { +#ifdef USE_MKL + VSLStreamStatePtr stream; + // Initializing the streams + vslNewStream(&stream, VSL_BRNG_SOBOL, 1); + // Generating + vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, 0.0f, 1.0f); + // Deleting the streams + vslDeleteStream(&stream); +#else + for (size_t i = 0; i < n; ++i) { + r[i] = distribution(generator); + } + //galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + // unsigned short xi[3]; + // r[i] = erand48(xi); + //}, galois::loopname("randomMaskGen")); +#endif +} + const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 /* // vector add @@ -198,16 +222,37 @@ void clear_cpu(size_t n, float_t* in) { void dropout(size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { for (size_t i = 0; i < m; ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < m; ++i) out[i] = in[i] * (float_t)masks[i] * scale; } void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { - for (size_t i = 0; i < n*m; ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { + size_t len = n * m; +/* +#ifdef USE_MKL + vec_t rands(len); + rng_uniform_cpu(len, &rands[0]); + galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { + masks[i] = rands[i] > dropout_rate ? 1 : 0; + }, galois::loopname("randomMaskGen")); +*/ +/* + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + auto idx = i * m; + vec_t rands(m); + rng_uniform_cpu(m, &rands[0]); + for (size_t j = 0; j < m; ++j) + masks[idx+j] = rands[j] > dropout_rate ? 
1 : 0; + }, galois::loopname("dropout")); +#else +*/ + for (size_t i = 0; i < len; ++i) { + masks[i] = bernoulli(dropout_rate); + } +//#endif + galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, galois::loopname("dropout")); } From 4bafd7274a7231f26b9016bdd377abaa1672fb3d Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 15:32:56 -0500 Subject: [PATCH 221/660] udapte lgraph --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/lgraph.h | 8 ++++++++ libdeepgalois/src/lgraph.cpp | 14 ++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 69a6e7fa40..157e0151ad 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -85,6 +85,7 @@ set(sources src/DistContext.cpp src/optimizer.cpp src/sampler.cpp + src/lgraph.cpp src/utils.cpp src/node.cpp src/net.cpp diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 733c6620d8..315ec1145a 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -15,6 +15,7 @@ class LearningGraph { index_t *degrees_; vdata_t *vertex_data_; edata_t *edge_data_; + std::vector> mirrorNodes; public: typedef size_t iterator; @@ -51,6 +52,13 @@ class LearningGraph { void constructNodes(); void fixEndEdge(index_t vid, index_t row_end); void constructEdge(index_t eid, index_t dst, edata_t edata); + + bool isLocal(index_t vid); + index_t getLID(index_t vid); + bool is_vertex_cut(); + std::vector>& getMirrorNodes(); + uint64_t numMasters(); + uint64_t globalSize(); }; } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 891973e612..3573a9627a 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -12,6 +12,20 @@ namespace deepgalois { +bool LearningGraph::isLocal(index_t vid) { return true; } + +index_t LearningGraph::getLID(index_t vid) { return 0; } + +bool LearningGraph::is_vertex_cut() {return true; } + +std::vector>& LearningGraph::getMirrorNodes() { + return mirrorNodes; +} + +uint64_t LearningGraph::numMasters() { return 0; } + +uint64_t LearningGraph::globalSize() { return 0; } + void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { const unsigned nsteps = 10; unsigned ineachstep = (maxii / nsteps); From f6fd899e1421b706aaab8d8095d9a4bf1b574cb9 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 21:29:22 -0500 Subject: [PATCH 222/660] fix sampler bug --- .../include/deepgalois/DistContext.h | 6 +- libdeepgalois/include/deepgalois/context.h | 34 ++-------- .../include/deepgalois/layers/layer.h | 13 +++- libdeepgalois/include/deepgalois/net.h | 9 +-- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/context.cpp | 55 ++++++++++++---- libdeepgalois/src/context.cu | 12 ++-- libdeepgalois/src/layers/aggregator.cpp | 17 +++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 3 +- libdeepgalois/src/layers/graph_conv_layer.cu | 2 - .../src/layers/softmax_loss_layer.cpp | 9 +-- libdeepgalois/src/lgraph.cpp | 10 +-- libdeepgalois/src/net.cpp | 64 ++++++++++++------- libdeepgalois/src/sampler.cpp | 4 +- 14 files changed, 138 insertions(+), 102 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 04aca5fc9e..d7e368965a 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -26,7 +26,7 
@@ class DistContext { label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factor; // normalization constant based on graph structure + float_t* norm_factors; // normalization constant based on graph structure public: DistContext(); @@ -47,12 +47,12 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_counting(size_t g_size); + void norm_factor_computing(size_t g_size); void createSubgraph() {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} - float_t* get_norm_factor_ptr() { return norm_factor; } + float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; float_t* get_feats_ptr() { return h_feats; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index ff324ef60f..fc9748d952 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -31,12 +31,13 @@ class Context { label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label - float_t* get_norm_factor_ptr() { return norm_factor; } + float_t* get_norm_factors_ptr() { return norm_factors; } + float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_counting(size_t g_size); + void norm_factor_computing(bool is_subgraph); #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N @@ -66,30 +67,7 @@ class Context { inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif -/* - // This random number generator facade hides boost and CUDA rng - // implementation from one another (for cross-platform compatibility). 
- class RNG { - public: - RNG(); - explicit RNG(unsigned int seed); - explicit RNG(const RNG&); - RNG& operator=(const RNG&); - void* generator(); - private: - class Generator; - boost::shared_ptr generator_; - }; - static Context& Get(); - // Getters for boost rng, curand, and cublas handles - inline static RNG& rng_stream() { - if (!Get().random_generator_) { - Get().random_generator_.reset(new RNG()); - } - return *(Get().random_generator_); - } -*/ protected: size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -105,8 +83,10 @@ class Context { label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factor; // normalization constant based on graph structure - //boost::shared_ptr random_generator_; + float_t* norm_factors; // normalization constant based on graph structure + float_t* norm_factors_subg; // normalization constant for subgraph + void alloc_norm_factor(); + void alloc_subgraph_norm_factor(); #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7f1c05ce60..0ffab6de41 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -72,6 +72,7 @@ class layer : public deepgalois::node { void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } + void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata #ifdef CPU_ONLY @@ -79,7 +80,7 @@ class layer : public deepgalois::node { #else void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } #endif - void update_dim_size(size_t sg_size) { input_dims[0] = output_dims[0] = sg_size; } + void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { @@ -93,11 +94,15 @@ class layer : public deepgalois::node { begin_ = sample_begin; end_ = sample_end; count_ = sample_count; + use_mask = false; + if (masks != NULL) { + use_mask = true; #ifdef CPU_ONLY - masks_ = masks; + masks_ = masks; #else - d_masks_ = masks; + d_masks_ = masks; #endif + } } void add_edge() { @@ -151,6 +156,7 @@ class layer : public deepgalois::node { std::vector output_dims; // output dimentions std::string name_; // name of this layer bool trainable_; // is this layer trainable + bool use_mask; vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x // 16, layer1: 16 x E @@ -162,6 +168,7 @@ class layer : public deepgalois::node { float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; + float_t* norm_consts; #ifdef CPU_ONLY Graph *graph_cpu; #else diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index fe5eaa8aac..f87b4e549a 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -118,13 +118,10 @@ class Net { #ifdef CPU_ONLY Sampler *sampler; - // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); -#else - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); #endif + // comparing outputs with the ground truth (labels) + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); }; } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 174e7eb210..e53dc1c118 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,7 +151,7 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_counting(size_t g_size) { +void DistContext::norm_factor_computing(size_t g_size) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index ffc2069024..bb0e67c818 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -26,12 +26,15 @@ Context::Context() : n(0), num_classes(0), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factor(NULL) {} + norm_factors(NULL) {} Context::~Context() { if (h_labels) delete h_labels; + if (h_labels_subg) delete h_labels_subg; if (h_feats) delete h_feats; - if (norm_factor) delete norm_factor; + if (h_feats_subg) delete h_feats_subg; + if (norm_factors) delete norm_factors; + if (norm_factors_subg) delete norm_factors_subg; } size_t Context::read_graph(std::string dataset_str, bool selfloop) { @@ -137,30 +140,56 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -void Context::norm_factor_counting(size_t g_size) { - auto g = getGraphPointer(); - auto subg = getSubgraphPointer(); - if (use_subgraph) g = subg; +void Context::alloc_norm_factor() { + Graph* g = getGraphPointer(); + if (norm_factors == NULL) +#ifdef USE_MKL + norm_factors = new float_t[g->sizeEdges()]; +#else + norm_factors = new float_t[g->size()]; +#endif +} + +void Context::alloc_subgraph_norm_factor() { + Graph* g = getSubgraphPointer(); + if (norm_factors_subg == NULL) +#ifdef USE_MKL + norm_factors_subg = new float_t[g->sizeEdges()]; +#else + norm_factors_subg = new float_t[g->size()]; +#endif +} + +void Context::norm_factor_computing(bool is_subgraph) { + Graph* g; + float_t *constants; + if (!is_subgraph) { + g = getGraphPointer(); + alloc_norm_factor(); + constants = norm_factors; + } else { + g = getSubgraphPointer(); + alloc_subgraph_norm_factor(); + constants = norm_factors_subg; + } + auto g_size = 
g->size(); g->degree_counting(); - if (norm_factor != NULL) free(norm_factor); #ifdef USE_MKL - norm_factor = new float_t[g->sizeEdges()]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { float_t c_i = std::sqrt(float_t(g->get_degree(i))); for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { const auto j = g->getEdgeDst(e); float_t c_j = std::sqrt(float_t(g->get_degree(j))); - if (c_i == 0.0 || c_j == 0.0) norm_factor[e] = 0.0; - else norm_factor[e] = 1.0 / (c_i * c_j); + if (c_i == 0.0 || c_j == 0.0) constants[e] = 0.0; + else constants[e] = 1.0 / (c_i * c_j); } }, galois::loopname("NormCountingEdge")); #else - norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; + if (temp == 0.0) constants[v] = 0.0; + else constants[v] = 1.0 / temp; }, galois::loopname("NormCountingVertex")); #endif } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 23abd3f1c2..0042f5420e 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -25,7 +25,7 @@ int64_t cluster_seedgen(void) { } // computing normalization factor for each vertex -__global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); if (temp == 0.0) norm_fac[i] = 0.0; @@ -35,7 +35,7 @@ __global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_f // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); @@ -97,7 +97,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::norm_factor_counting(size_t g_size) { +void Context::norm_factor_computing(bool is_subgraph) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; @@ -107,14 +107,14 @@ void Context::norm_factor_counting(size_t g_size) { int nnz = graph_gpu.nedges; CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); init_const_gpu(nnz, 0.0, norm_factor); - norm_factor_counting_edge<<>>( + norm_factor_computing_edge<<>>( n, graph_gpu, norm_factor); #else CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); - norm_factor_counting_node<<>>( + norm_factor_computing_node<<>>( n, graph_gpu, norm_factor); #endif - CudaTest("solving norm_factor_counting kernel failed"); + CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; } /* diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 0dec25c019..fc841a6361 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,14 +4,17 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + size_t n = g.size(); + //std::cout << "[update_all] graph size: " << n << "\n"; #ifndef GALOIS_USE_DIST - galois::do_all(galois::iterate(size_t(0), g.size()),[&](const auto src) { + galois::do_all(galois::iterate(size_t(0), n),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif + auto src_idx = src * len; // zero out the output data - math::clear_cpu(len , &out[src * len]); + math::clear_cpu(len , &out[src_idx]); float_t a = 0.0; float_t b = 0.0; // get normalization factor if needed @@ -19,21 +22,23 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // gather neighbors' embeddings for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { const auto dst = g.getEdgeDst(e); + assert(dst < n); + auto dst_idx = dst * len; if (norm) { // normalize b as well b = a * norm_factor[dst]; //float_t* neighbor = new float_t[len]; // this is super slow vec_t neighbor(len); // scale the neighbor's data using the normalization factor - math::scale(len, b, &in[dst * len], &neighbor[0]); + math::scale(len, b, &in[dst_idx], &neighbor[0]); // use scaled data to update; out[src] += in[dst] - math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); } else { // add embeddings from neighbors together; out[src] += in[dst] - math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); + math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); } } - }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); + }, galois::steal(), galois::loopname("update_all")); } void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 31622e0699..e50d66f5ae 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -40,7 +40,6 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t // aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { // normalization constant based on graph structure - float_t* norm_consts = context->get_norm_factor_ptr(); #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); #else @@ -50,7 +49,6 @@ void 
graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_ // since graph is symmetric, the derivative is the same void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - float_t* norm_consts = context->get_norm_factor_ptr(); #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #else @@ -101,6 +99,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; + //std::cout << "layer: " << name_ << "\n"; //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; // input: x*y; W: y*z; output: x*z diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 7edb4ab10c..ef62725da2 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -31,7 +31,6 @@ void graph_conv_layer::malloc_and_init() { } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { - float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else @@ -40,7 +39,6 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo } void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { - float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index f1c1aa27e4..4ae9c6364b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -31,11 +31,11 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked // output is normalized input for this layer math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax // one hot encoded vector for the labels - std::vector groundTruth(output_dims[1], 0.0); // ground truth + vec_t groundTruth(output_dims[1], 0.0); // ground truth groundTruth[get_label(i)] = 1.0; // one-hot // loss calculation loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); @@ -52,7 +52,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked vec_t norm_grad(len); std::vector groundTruth(len, 0.0); groundTruth[get_label(i)] = 1.0; @@ -74,11 +74,12 @@ acc_t softmax_loss_layer::get_prediction_loss() { total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i]) { + if (!use_mask || masks_[i]) { total_loss += loss[i]; valid_sample_count += 1; } }, galois::chunk_size<64>(), galois::steal(), galois::loopname("getMaskedLoss")); + //std::cout << "begin = " << begin_ << " end = " << end_ << " count = " << count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; 
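
For the masked softmax-loss layer above, a compact serial sketch of what is computed per masked row: a numerically stabilized softmax, the cross-entropy against a one-hot label, and the gradient with respect to the input logits, which mathematically reduces to softmax(x) - onehot(label) (the same quantity the layered d_cross_entropy/d_softmax chain produces). Function names are illustrative.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax over one row of logits (assumes x is non-empty).
void softmax_sketch(const std::vector<float>& x, std::vector<float>& p) {
  float m = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  p.resize(x.size());
  for (size_t j = 0; j < x.size(); ++j) { p[j] = std::exp(x[j] - m); sum += p[j]; }
  for (size_t j = 0; j < x.size(); ++j) p[j] /= sum;
}

// Cross-entropy of predicted distribution p against a one-hot label.
float cross_entropy_sketch(const std::vector<float>& p, size_t label) {
  const float eps = 1e-10f;                 // guard against log(0)
  return -std::log(std::max(p[label], eps));
}

// Gradient of the loss w.r.t. the logits: softmax(x) - onehot(label).
void softmax_xent_grad_sketch(const std::vector<float>& p, size_t label,
                              std::vector<float>& grad) {
  grad = p;
  grad[label] -= 1.0f;
}
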
assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 3573a9627a..dd3390d331 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -165,11 +165,11 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { #ifdef CPU_ONLY void LearningGraph::dealloc() { assert (!is_device); - free(rowptr_); - free(colidx_); - free(degrees_); - if (vertex_data_ != NULL) free(vertex_data_); - if (edge_data_ != NULL) free(edge_data_); + if (rowptr_ != NULL) delete rowptr_; + if (colidx_ != NULL) delete colidx_; + if (degrees_ != NULL) delete degrees_; + if (vertex_data_ != NULL) delete vertex_data_; + if (edge_data_ != NULL) delete edge_data_; } #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 86bd0f6340..b5d0e36197 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -162,17 +162,17 @@ void Net::train(optimizer* opt, bool need_validate) { t_epoch.Start(); if (subgraph_sample_size && num_subg_remain == 0) { + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(subgraph_sample_size); #ifdef CPU_ONLY // generate subgraph context->createSubgraph(); auto subgraph_ptr = context->getSubgraphPointer(); sampler->subgraph_sample(subgraph_sample_size, *(subgraph_ptr), subgraph_masks); - for (size_t i = 0; i < num_conv_layers-1; i++) { + context->norm_factor_computing(1); + for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(context->getSubgraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); } - // update masks for subgraph - layers[num_layers - 1]->set_sample_mask(train_begin, train_end, train_count, subgraph_masks); - // update labels for subgraph context->gen_subgraph_labels(subgraph_sample_size, subgraph_masks); layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); @@ -180,8 +180,6 @@ void Net::train(optimizer* opt, bool need_validate) { // update features for subgraph context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data - - context->norm_factor_counting(subgraph_sample_size); #endif num_subg_remain += 1; // num_threads } @@ -215,6 +213,17 @@ void Net::train(optimizer* opt, bool need_validate) { double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; if (need_validate && ep % val_interval == 0) { + if (subgraph_sample_size) { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); +#ifdef CPU_ONLY + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(context->getGraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + } + layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data +#endif + } // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); @@ -247,7 +256,13 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { end = train_end; count = train_count; masks = train_masks; - if (subgraph_sample_size) masks = subgraph_masks; + if (subgraph_sample_size) { + // update masks for subgraph + masks = NULL; + begin = 0; + end = subgraph_sample_size; + count = subgraph_sample_size; + } } else if (type == "val") { begin = val_begin; end = val_end; @@ -270,10 +285,17 @@ double Net::evaluate(std::string type, acc_t& 
loss, acc_t& acc) { #endif loss = fprop(begin, end, count, masks); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + label_t* labels; + if (subgraph_sample_size) { + labels = context->get_labels_subg_ptr(); + } else { + labels = context->get_labels_ptr(); + } if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks, predictions, labels); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks); + acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); } t_eval.Stop(); return t_eval.Millisecs(); @@ -347,8 +369,6 @@ void Net::construct_layers() { // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { - if (subgraph_sample_size) - layers[i]->update_dim_size(subgraph_sample_size); layers[i]->add_edge(); } for (size_t i = 1; i < num_layers; i++) @@ -357,7 +377,9 @@ void Net::construct_layers() { layers[i]->malloc_and_init(); layers[0]->set_in_data(context->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - if (!subgraph_sample_size) context->norm_factor_counting(num_samples); + context->norm_factor_computing(0); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); set_contexts(); } @@ -445,7 +467,7 @@ void Net::read_test_masks(std::string dataset) { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { #ifndef GALOIS_USE_DIST AccumF accuracy_all; #else @@ -458,12 +480,11 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks galois::do_all(galois::iterate(begin, end), [&](const auto& i) { #ifndef GALOIS_USE_DIST - if (masks[i] == 1) { + if (masks == NULL || masks[i] == 1) { // use sampled graph when masks is NULL // get prediction - int preds = math::argmax(num_classes, - &(layers[num_conv_layers - 1]->next()->get_data()[i * num_classes])); + auto pred = math::argmax(num_classes, preds+i*num_classes); // check prediction - if ((label_t)preds == context->get_label(i)) + if ((label_t)pred == ground_truth[i]) accuracy_all += 1.0; } #else @@ -475,10 +496,9 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks uint32_t localID = dGraph->getLID(i); if (masks[localID] == 1) { // get prediction - int preds = math::argmax(num_classes, - &(layers[num_conv_layers - 1]->next()->get_data()[localID * num_classes])); + auto preds = math::argmax(num_classes, preds+localID*num_classes); // check prediction - if ((label_t)preds == context->get_label(localID)) + if ((label_t)preds == ground_truth[localID]) accuracy_all += 1.0; } } @@ -494,9 +514,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks return accuracy_all.reduce() / (acc_t)count; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { - auto preds = layers[num_conv_layers]->next()->get_data(); - auto ground_truth = context->get_labels_ptr(); +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } #endif diff --git a/libdeepgalois/src/sampler.cpp 
b/libdeepgalois/src/sampler.cpp index f1e4238a84..c7d5639330 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -140,7 +140,9 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { unsigned j = 0; auto old_id = old_ids[i]; for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { - sub.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); + auto dst = new_ids[g.getEdgeDst(e)]; + assert(dst < nv); + sub.constructEdge(offsets[i]+j, dst, 0); j ++; } }, galois::loopname("construct_graph")); From c0083d4cfa3396b4abb5b0da0f05284c6a761def Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 2 May 2020 13:03:52 -0500 Subject: [PATCH 223/660] update sampler --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/context.h | 14 ++-- libdeepgalois/include/deepgalois/net.h | 9 ++- libdeepgalois/src/context.cpp | 18 +++--- libdeepgalois/src/context.cu | 9 +++ libdeepgalois/src/lgraph.cpp | 2 +- libdeepgalois/src/net.cpp | 64 +++++++++++++------ libdeepgalois/src/sampler.cpp | 16 +++-- lonestargnn/gcn/gcn.cpp | 6 +- 9 files changed, 89 insertions(+), 51 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index d7e368965a..32d95fcc99 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -48,7 +48,7 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_computing(size_t g_size); - void createSubgraph() {} + void createSubgraphs(int num_subgraphs) {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index fc9748d952..0f6d96ec98 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -37,22 +37,22 @@ class Context { void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_computing(bool is_subgraph); + void norm_factor_computing(bool is_subgraph, int subg_id = 0); + void gen_subgraph_labels(size_t m, const mask_t *masks); + void gen_subgraph_feats(size_t m, const mask_t *masks); + void createSubgraphs(int num_subgraphs); #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N - Graph* subgraph_cpu; - void createSubgraph(); + std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer() { return subgraph_cpu; }; + Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } - void gen_subgraph_labels(size_t m, const mask_t *masks); - void gen_subgraph_feats(size_t m, const mask_t *masks); #else CSRGraph graph_gpu; // the input graph, |V| = N CSRGraph subgraph_gpu; @@ -86,7 +86,7 @@ class Context { float_t* norm_factors; // normalization constant based on graph structure float_t* norm_factors_subg; // normalization constant for subgraph void alloc_norm_factor(); - void alloc_subgraph_norm_factor(); + void alloc_subgraph_norm_factor(int subg_id); #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index f87b4e549a..6708e2ce63 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -31,14 +31,14 @@ class Net { public: Net() : is_single_class(true), has_l2norm(false), has_dense(false), neighbor_sample_size(0), subgraph_sample_size(0), - num_samples(0), num_classes(0), + num_threads(1), num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), val_count(0), test_begin(0), test_end(0), test_count(0), train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - void init(std::string dataset_str, unsigned num_conv, unsigned epochs, + void init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, unsigned neigh_sample_size = 0, unsigned subg_sample = 0); @@ -87,6 +87,7 @@ class Net { bool has_dense; // whether the net contains an dense layer unsigned neighbor_sample_size; // neighbor sampling unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers @@ -99,6 +100,8 @@ class Net { size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; int val_interval; + int num_subgraphs; + int num_vertices_sg; mask_t* train_masks; // masks for training mask_t* d_train_masks; // masks for training on device @@ -106,7 +109,7 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* test_masks; // masks for test mask_t* d_test_masks; // masks for test on device - mask_t* subgraph_masks; // masks for subgraph + mask_t* subgraphs_masks; // masks for subgraphs std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network #ifndef GALOIS_USE_DIST diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bb0e67c818..bbaa915e0f 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -26,7 +26,7 @@ Context::Context() : n(0), num_classes(0), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factors(NULL) {} + norm_factors(NULL), 
norm_factors_subg(NULL) {} Context::~Context() { if (h_labels) delete h_labels; @@ -42,8 +42,10 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::createSubgraph() { - subgraph_cpu = new Graph(); +void Context::createSubgraphs(int num_subgraphs) { + subgraphs_cpu.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) + subgraphs_cpu[i] = new Graph(); } // generate labels for the subgraph, m is subgraph size @@ -150,8 +152,8 @@ void Context::alloc_norm_factor() { #endif } -void Context::alloc_subgraph_norm_factor() { - Graph* g = getSubgraphPointer(); +void Context::alloc_subgraph_norm_factor(int subg_id) { + Graph* g = getSubgraphPointer(subg_id); if (norm_factors_subg == NULL) #ifdef USE_MKL norm_factors_subg = new float_t[g->sizeEdges()]; @@ -160,7 +162,7 @@ void Context::alloc_subgraph_norm_factor() { #endif } -void Context::norm_factor_computing(bool is_subgraph) { +void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; float_t *constants; if (!is_subgraph) { @@ -168,8 +170,8 @@ void Context::norm_factor_computing(bool is_subgraph) { alloc_norm_factor(); constants = norm_factors; } else { - g = getSubgraphPointer(); - alloc_subgraph_norm_factor(); + g = getSubgraphPointer(subg_id); + alloc_subgraph_norm_factor(subg_id); constants = norm_factors_subg; } auto g_size = g->size(); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 0042f5420e..531671c3c2 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -92,6 +92,15 @@ Context::~Context() { if (norm_factor) CUDA_CHECK(cudaFree(norm_factor)); } +void Context::createSubgraphs(int n_sg) { +} + +void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { +} + +void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { +} + size_t Context::read_graph(std::string dataset_str, bool selfloop) { n = read_graph_gpu(dataset_str, selfloop); return n; diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index dd3390d331..a2c4a9e4ca 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -40,7 +40,7 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { void LearningGraph::allocateFrom(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; - printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); rowptr_ = new index_t[num_vertices_+1]; colidx_ = new index_t[num_edges_]; rowptr_[0] = 0; diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index b5d0e36197..e9c64a0f7a 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -9,12 +9,13 @@ namespace deepgalois { -void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, +void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz) { - assert(num_conv > 0); - num_conv_layers = num_conv; + assert(n_conv > 0); + num_threads = nt; + num_conv_layers = n_conv; num_epochs = epochs; learning_rate = lr; dropout_rate = dropout; @@ -25,7 +26,9 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, neighbor_sample_size = neigh_sz; subgraph_sample_size = subg_sz; val_interval = 1; - galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, + 
num_subgraphs = num_threads; + galois::gPrint("Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, ", learning_rate ", learning_rate, @@ -149,8 +152,9 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; #ifdef CPU_ONLY if (subgraph_sample_size) { + context->createSubgraphs(num_subgraphs); + subgraphs_masks = new mask_t[num_samples*num_subgraphs]; galois::gPrint("\nConstruct training vertex set induced graph...\n"); - subgraph_masks = new mask_t[num_samples]; sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } #endif @@ -161,28 +165,46 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); t_epoch.Start(); - if (subgraph_sample_size && num_subg_remain == 0) { - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(subgraph_sample_size); -#ifdef CPU_ONLY - // generate subgraph - context->createSubgraph(); - auto subgraph_ptr = context->getSubgraphPointer(); - sampler->subgraph_sample(subgraph_sample_size, *(subgraph_ptr), subgraph_masks); - context->norm_factor_computing(1); + if (subgraph_sample_size) { + if (num_subg_remain == 0) { + galois::gPrint("Generating subgraphs (mini-batches) ... "); + Timer t_subgen; + t_subgen.Start(); + // generate subgraphs + for (int sid = 0; sid < num_subgraphs; sid++) { + //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples]); + }//, galois::loopname("subgraph_gen")); + num_subg_remain = num_subgraphs; + t_subgen.Stop(); + galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); + } + for (int i = 0; i < num_subgraphs; i++) { + //auto sg_ptr = context->getSubgraphPointer(i); + //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); + } + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraph_ptr = context->getSubgraphPointer(sg_id); + num_vertices_sg = subgraph_ptr->size(); + galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, + ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(num_vertices_sg); + context->norm_factor_computing(1, sg_id); for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getSubgraphPointer()); + layers[i]->set_graph_ptr(subgraph_ptr); layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); } // update labels for subgraph - context->gen_subgraph_labels(subgraph_sample_size, subgraph_masks); + context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph - context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); + context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data -#endif - num_subg_remain += 1; // num_threads - } + } + // training steps set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -260,8 +282,8 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { // update masks for subgraph masks = NULL; begin = 0; - end = subgraph_sample_size; - count = subgraph_sample_size; + end = 
num_vertices_sg; + count = num_vertices_sg; } } else if (type == "val") { begin = val_begin; diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index c7d5639330..a54cc145d9 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -6,7 +6,7 @@ namespace deepgalois { void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { - galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); + //galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); begin_ = begin; end_ = end; count_ = count; @@ -40,7 +40,7 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su get_masked_degrees(n, masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; - galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); + //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); @@ -64,14 +64,14 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su // n: number of vertices in the subgraph; // m: number of vertices in the frontier. void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList vertices, VertexSet &vertex_set) { - galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); + //galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); assert(nv == vertices.size()); auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); for (int i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); - galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); + //galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { degrees[i] = (int)g->get_degree(frontier[i]); @@ -93,7 +93,8 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); } - assert(n == vertex_set.size()); + /* + assert(n == vertex_set.size()); // size of vertex_set could be slightly smaller than n galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); unsigned counter = 0; for (int i : vertex_set) { @@ -102,10 +103,11 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v galois::gPrint(i, " "); } galois::gPrint(" )\n"); + */ } void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { - galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); + //galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks+n, 0); for (auto v : vertices) masks[v] = 1; } @@ -130,7 +132,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; - galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); + //galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); diff --git 
a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 62a8067294..489553a689 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -25,9 +25,9 @@ int main(int argc, char** argv) { #endif // read network, features, ground truth, initialize metadata - network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, - dropout_rate, weight_decay, add_selfloop, - is_single_class, add_l2norm, add_dense, + network.init(dataset, numThreads, num_conv_layers, epochs, hidden1, + learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz); // default setting for now; can be customized by the user network.construct_layers(); From fdce957d48ff4bcc413654389936aeb1369531af Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 2 May 2020 13:36:19 -0500 Subject: [PATCH 224/660] fix gpu --- libdeepgalois/include/deepgalois/context.h | 4 ++-- libdeepgalois/src/context.cu | 18 ++++++++---------- libdeepgalois/src/layers/graph_conv_layer.cu | 8 ++++---- libdeepgalois/src/net.cpp | 4 +++- libdeepgalois/src/net.cu | 14 +++++--------- libgpu/include/graph_gpu.h | 2 ++ 6 files changed, 24 insertions(+), 26 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 0f6d96ec98..4a0ce506b2 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -55,9 +55,9 @@ class Context { label_t* get_labels_subg_ptr() { return h_labels_subg; } #else CSRGraph graph_gpu; // the input graph, |V| = N - CSRGraph subgraph_gpu; + std::vector subgraphs_gpu; CSRGraph* getGraphPointer() { return &graph_gpu; } - CSRGraph* getSubgraphPointer() { return &subgraph_gpu; }; + CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 531671c3c2..6f42196428 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -68,7 +68,7 @@ Context::Context() : n(0), num_classes(0), feat_len(0), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factor(NULL) { + norm_factors(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -89,7 +89,7 @@ Context::~Context() { CURAND_CHECK(curandDestroyGenerator(curand_generator_)); if (d_labels) CUDA_CHECK(cudaFree(d_labels)); if (d_feats) CUDA_CHECK(cudaFree(d_feats)); - if (norm_factor) CUDA_CHECK(cudaFree(norm_factor)); + if (norm_factors) CUDA_CHECK(cudaFree(norm_factors)); } void Context::createSubgraphs(int n_sg) { @@ -106,7 +106,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::norm_factor_computing(bool is_subgraph) { +void Context::norm_factor_computing(bool is_subgraph, int subg_id) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; @@ -114,14 +114,12 @@ void Context::norm_factor_computing(bool is_subgraph) { } #ifdef USE_CUSPARSE int nnz = graph_gpu.nedges; - CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); - init_const_gpu(nnz, 0.0, norm_factor); - norm_factor_computing_edge<<>>( - n, graph_gpu, norm_factor); + CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, norm_factors); + norm_factor_computing_edge<<>>(n, graph_gpu, norm_factors); #else - CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); - norm_factor_computing_node<<>>( - n, graph_gpu, norm_factor); + CUDA_CHECK(cudaMalloc((void**)&norm_factors, n * sizeof(float_t))); + norm_factor_computing_node<<>>(n, graph_gpu, norm_factors); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index ef62725da2..f4282ced42 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -32,17 +32,17 @@ void graph_conv_layer::malloc_and_init() { void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else - deepgalois::update_all(len, g, in, out, norm_, norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_consts); #endif } void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else - deepgalois::update_all(len, g, in, out, norm_, norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_consts); #endif } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index e9c64a0f7a..b9fd931746 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -87,8 +87,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); -#ifdef CPU_ONLY context->set_use_subgraph(subgraph_sample_size > 0); +#ifdef CPU_ONLY if (subgraph_sample_size) sampler = new deepgalois::Sampler(); #else copy_masks_device(num_samples, train_masks, d_train_masks); @@ -173,7 +173,9 @@ void Net::train(optimizer* opt, bool need_validate) { // generate subgraphs for (int sid = 0; sid < num_subgraphs; sid++) { //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { +#ifdef CPU_ONLY sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples]); +#endif }//, galois::loopname("subgraph_gen")); num_subg_remain = num_subgraphs; t_subgen.Stop(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 6ead99d31a..5cc8593647 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -143,18 +143,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, } namespace deepgalois { -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - return masked_accuracy_gpu(num_classes, begin, end, count, masks, - layers[num_conv_layers - 1]->next()->get_data(), - 
context->get_labels_ptr()); +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, label_t* ground_truth) { + return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - return masked_f1_score_gpu(num_classes, begin, end, count, masks, - layers[num_conv_layers]->next()->get_data(), - context->get_labels_ptr()); + mask_t* masks, float_t* preds, label_t* ground_truth) { + return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, ground_truth); } } // end namespace diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index e2057bf7af..f6f9c57643 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -175,6 +175,8 @@ struct CSRGraph { __device__ __host__ edge_data_type *edge_data_ptr() { return edge_data; } __device__ __host__ const edge_data_type *edge_data_ptr() const { return edge_data; } + size_t size() { return size_t(nnodes); } + size_t sizeEdges() { return size_t(nedges); } index_type nnodes, nedges; index_type* row_start; // row_start[node] points into edge_dst, node starts at // 0, row_start[nnodes] = nedges From 61ec2c9a2acf6fdc0af5c1451568b7f89b3b854e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 3 May 2020 13:50:23 -0500 Subject: [PATCH 225/660] fix dist --- libdeepgalois/include/deepgalois/DistContext.h | 2 +- libdeepgalois/src/DistContext.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 32d95fcc99..1304b631f8 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -47,7 +47,7 @@ class DistContext { //! 
find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_computing(size_t g_size); + void norm_factor_computing(bool is_subgraph, int subg_id); void createSubgraphs(int num_subgraphs) {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e53dc1c118..3f915ec062 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,22 +151,22 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_computing(size_t g_size) { +void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in - norm_factor = new float_t[localVertices]; + norm_factors = new float_t[localVertices]; galois::do_all(galois::iterate((size_t)0, localVertices), [&](auto v) { - norm_factor[v] = 1; + norm_factors[v] = 1; }, galois::loopname("NormCounting")); //galois::do_all(galois::iterate((size_t)0, localVertices), // [&](auto v) { // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); // float_t temp = std::sqrt(float_t(degree)); - // if (temp == 0.0) norm_factor[v] = 0.0; - // else norm_factor[v] = 1.0 / temp; + // if (temp == 0.0) norm_factors[v] = 0.0; + // else norm_factors[v] = 1.0 / temp; // }, galois::loopname("NormCounting")); return; From de8feacd310c9987e6317d8788b06a14fa7b4323 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 3 May 2020 20:28:47 -0500 Subject: [PATCH 226/660] update sampler --- libdeepgalois/include/deepgalois/sampler.h | 15 +- libdeepgalois/src/net.cpp | 20 +- libdeepgalois/src/sampler.cpp | 234 ++++++++++++++++++--- 3 files changed, 232 insertions(+), 37 deletions(-) diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 15c82ffa12..c92e8471d3 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -2,16 +2,23 @@ #include "deepgalois/gtypes.h" namespace deepgalois { +#define ETA 1.5 // length factor of DB in sampling +#define SAMPLE_CLIP 3000 // clip degree in sampling +#define DEFAULT_SIZE_FRONTIER 3000 +#define DEFAULT_SIZE_SUBG 9000 + class Sampler { public: - Sampler() : m_(1000) {} + typedef int db_t; + Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sample(size_t n, Graph &sg, mask_t* masks); + void subgraph_sample(size_t n, Graph &sg, mask_t* masks, unsigned tid = 0); // !API function for user-defined selection strategy virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); + virtual void select_vertices(size_t n, int m, VertexSet &vertex_set, unsigned tid); galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); @@ -26,7 +33,10 @@ class Sampler { size_t count_; size_t begin_; size_t end_; + int avg_deg; + int subg_deg; VertexList vertices_; + std::vector node_train; mask_t *masks_; Graph *masked_graph; Graph *graph; @@ -37,6 +47,7 @@ class Sampler { void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector °rees); void update_masks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); + void check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size); }; } diff --git 
a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index b9fd931746..39909b4d34 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -167,30 +167,32 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint("Generating subgraphs (mini-batches) ... "); + //galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... "); Timer t_subgen; t_subgen.Start(); // generate subgraphs - for (int sid = 0; sid < num_subgraphs; sid++) { - //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + //for (int sid = 0; sid < num_subgraphs; sid++) { + galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + unsigned tid = 0; + tid = galois::substrate::ThreadPool::getTID(); #ifdef CPU_ONLY - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples]); + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); #endif - }//, galois::loopname("subgraph_gen")); + }, galois::loopname("subgraph_gen")); num_subg_remain = num_subgraphs; t_subgen.Stop(); - galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); + //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); } for (int i = 0; i < num_subgraphs; i++) { - //auto sg_ptr = context->getSubgraphPointer(i); + auto sg_ptr = context->getSubgraphPointer(i); + sg_ptr->degree_counting(); //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); } num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); num_vertices_sg = subgraph_ptr->size(); - galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, - ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_vertices_sg); context->norm_factor_computing(1, sg_id); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index a54cc145d9..2157d97dc5 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -4,6 +4,11 @@ #include namespace deepgalois { +inline unsigned getDegree(Graph *g, index_t v) { + //return g->get_degree(v); + //return std::distance(g->edge_begin(v), g->edge_end(v)); + return g->edge_end(v) - g->edge_begin(v); +} void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { //galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); @@ -15,7 +20,31 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m #ifndef GALOIS_USE_DIST masked_graph = new Graph(); #endif - generate_masked_graph(g->size(), masks, g, *masked_graph); + //generate_masked_graph(g->size(), masks, g, *masked_graph); + std::vector degrees(g->size(), 0); + get_masked_degrees(g->size(), masks, g, degrees); + auto offsets = deepgalois::parallel_prefix_sum(degrees); + size_t ne = offsets[g->size()]; + for (size_t i = 0; i < g->size(); i++) { + if (masks[i] == 1) node_train.push_back(i); + } + masked_graph->allocateFrom(g->size(), ne); + masked_graph->constructNodes(); + galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { + masked_graph->fixEndEdge(src, offsets[src+1]); + if (masks[src] == 1) { + auto idx 
= offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) masked_graph->constructEdge(idx++, dst, 0); + } + } + }, galois::loopname("gen_subgraph")); + + masked_graph->degree_counting(); + avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + //galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); size_t idx = 0; vertices_.resize(count); for (size_t i = begin; i < end; i++) { @@ -25,26 +54,29 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + for (size_t src = 0; src < n; src++) { + //galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { if (masks[src] == 1) { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } } - }, galois::loopname("update_degrees")); + }//, galois::loopname("update_degrees")); } void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); - auto offsets = deepgalois::parallel_prefix_sum(degrees); + //auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); - galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { + for (size_t src = 0; src < n; src++) { + //galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; @@ -53,11 +85,164 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } } - }, galois::loopname("gen_subgraph")); - sub.degree_counting(); + }//, galois::loopname("gen_subgraph")); #endif } +void Sampler::check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size) { + if (DB0.capacity() < size) { + DB0.reserve(DB0.capacity()*2); + DB1.reserve(DB1.capacity()*2); + DB2.reserve(DB2.capacity()*2); + } + DB0.resize(size); + DB1.resize(size); + DB2.resize(size); +} + +void print_vertex_set(VertexSet vertex_set) { + unsigned counter = 0; + unsigned n = vertex_set.size(); + galois::gPrint("( "); + for (int i : vertex_set) { + counter ++; + if (counter > 16 && counter < n-16) continue; + galois::gPrint(i, " "); + } + galois::gPrint(")\n"); +} + +void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { + //unsigned myseed = time(NULL); + unsigned myseed = tid; + //DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg*m*ETA); + DB1.reserve(subg_deg*m*ETA); + DB2.reserve(subg_deg*m*ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + //galois::gPrint("seed ", myseed, " m ", m, "\n"); + //galois::gPrint("node_train size: ", node_train.size(), "\n"); + //printf("( "); + 
//for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; + //printf(")\n"); + for (int i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % node_train.size(); + db_t v = IA3[i] = node_train[rand_idx]; + st.insert(v); + IA0[i] = getDegree(masked_graph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for each frontier in DB + IA2[0] = IA0[0]; + for (int i = 1; i < m; i++) IA2[i] = IA2[i-1] + IA0[i]; + // now fill DB accordingly + check_DB(DB0, DB1, DB2, IA2[m-1]); + for (int i = 0; i < m; i++) { + db_t DB_start = (i==0) ? 0 : IA2[i-1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (size_t itr = 0; itr < n-m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(masked_graph, v); + neigh_v = (degree!=0) ? rand_r(&myseed)%degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v)+neigh_v); + st.insert(neigh_v); + IA1[DB2[choose]-1] = 0; + IA0[DB2[choose]-1] = 0; + for (auto i = choose; i < choose-DB1[choose]; i++) DB0[i] = db_t(-1); + newsize = getDegree(masked_graph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } + else newsize = 0; + //shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0]=IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) IA4[i] = IA4[i-1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) continue; + db_t DB_start = (i==0) ? 0 : IA4[i-1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i-1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) *i = IA4[*i - 1]; + db_t curr=0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr]=IA0[i]; + IA1[curr]=IA1[i]; + IA2[curr]=IA2[i]; + IA3[curr]=IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + check_DB(DB0, DB1, DB2, newsize+DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j==DB_start) ? 
(j-DB_end) : (j-DB_start); + DB2[j] = IA3.size(); + } + } + //galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + //print_vertex_set(st); +} + // !API function for user-defined selection strategy // Select n vertices from vertices and put them in vertex_set. // nv: number of vertices in the original graph; @@ -73,9 +258,10 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v vertex_set.insert(frontier.begin(), frontier.end()); //galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; - galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { - degrees[i] = (int)g->get_degree(frontier[i]); - }, galois::loopname("compute_degrees")); + for (int i = 0; i < m; i++) { + //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + degrees[i] = (int)getDegree(g, frontier[i]); + }//, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); auto u = frontier[pos]; @@ -86,7 +272,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); if (vertex_set.find(dst) == vertex_set.end()) { frontier[pos] = dst; - degrees[pos] = g->get_degree(frontier[pos]); + degrees[pos] = getDegree(g, frontier[pos]); vertex_set.insert(dst); break; } @@ -95,14 +281,8 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } /* assert(n == vertex_set.size()); // size of vertex_set could be slightly smaller than n - galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); - unsigned counter = 0; - for (int i : vertex_set) { - counter ++; - if (counter > 16 && counter < n-16) continue; - galois::gPrint(i, " "); - } - galois::gPrint(" )\n"); + galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: "); + print_vertex_set(vertex_set); */ } @@ -128,16 +308,18 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { - degrees[new_ids[v]] = g.get_degree(v); + degrees[new_ids[v]] = getDegree(&g, v); } - auto offsets = deepgalois::parallel_prefix_sum(degrees); + //auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); auto ne = offsets[nv]; //galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping - galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + for (size_t i = 0; i < nv; i++) { + //galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; @@ -147,14 +329,14 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.constructEdge(offsets[i]+j, dst, 0); j ++; } - }, galois::loopname("construct_graph")); - sub.degree_counting(); + }//, galois::loopname("construct_graph")); #endif } -void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { +void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks, unsigned tid) { VertexSet vertex_set; // n = 9000 by default - select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 
1000 by default + //select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + select_vertices(n, m_, vertex_set, tid); // m = 1000 by default update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set #ifndef GALOIS_USE_DIST Graph masked_sg; From 1488798572187bcee1137da0191e16e2b216168d Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 4 May 2020 14:27:56 -0500 Subject: [PATCH 227/660] minor fix --- libdeepgalois/include/deepgalois/cutils.h | 2 +- libdeepgalois/src/context.cpp | 12 ++++----- .../src/layers/sigmoid_loss_layer.cpp | 8 +++--- .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/lgraph.cpp | 27 ++++++++++++------- libdeepgalois/src/net.cpp | 10 +++---- libdeepgalois/src/net.cu | 8 +++--- libdeepgalois/src/sampler.cpp | 3 ++- libgpu/include/graph_gpu.h | 1 + 9 files changed, 42 insertions(+), 31 deletions(-) diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 5181408363..383c9d6325 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -177,6 +177,6 @@ inline void print_device_vector(size_t n, const float_t *d_x, std::string name = float_t *h_x = new float_t[n]; CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; - delete h_x; + delete[] h_x; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bbaa915e0f..9f2b306371 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -29,12 +29,12 @@ Context::Context() : n(0), num_classes(0), norm_factors(NULL), norm_factors_subg(NULL) {} Context::~Context() { - if (h_labels) delete h_labels; - if (h_labels_subg) delete h_labels_subg; - if (h_feats) delete h_feats; - if (h_feats_subg) delete h_feats_subg; - if (norm_factors) delete norm_factors; - if (norm_factors_subg) delete norm_factors_subg; + if (h_labels) delete[] h_labels; + if (h_labels_subg) delete[] h_labels_subg; + if (h_feats) delete[] h_feats; + if (h_feats_subg) delete[] h_feats_subg; + if (norm_factors) delete[] norm_factors; + if (norm_factors_subg) delete[] norm_factors_subg; } size_t Context::read_graph(std::string dataset_str, bool selfloop) { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index ca34389127..10a4f8454a 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -13,7 +13,7 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, } sigmoid_loss_layer::~sigmoid_loss_layer() { - delete loss; + delete[] loss; } void sigmoid_loss_layer::malloc_and_init() { @@ -37,7 +37,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); // loss calculation loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); - delete ground_truth; + delete[] ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); } @@ -55,8 +55,8 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], norm_grad); - delete norm_grad; - delete ground_truth; + 
delete[] norm_grad; + delete[] ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 4ae9c6364b..54e461121f 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -13,7 +13,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, } softmax_loss_layer::~softmax_loss_layer() { - delete loss; + delete[] loss; } void softmax_loss_layer::malloc_and_init() { diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index a2c4a9e4ca..684d9b89e8 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -38,11 +38,20 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { } void LearningGraph::allocateFrom(index_t nv, index_t ne) { - num_vertices_ = nv; - num_edges_ = ne; //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); - rowptr_ = new index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; + if (num_vertices_ != nv) { + if (rowptr_ != NULL) delete [] rowptr_; + if (degrees_ != NULL) delete [] degrees_; + if (vertex_data_ != NULL) delete [] vertex_data_; + num_vertices_ = nv; + } + if (num_edges_ != ne) { + if (colidx_ != NULL) delete [] colidx_; + if (edge_data_ != NULL) delete [] edge_data_; + num_edges_ = ne; + } + if (rowptr_ == NULL) rowptr_ = new index_t[num_vertices_+1]; + if (colidx_ == NULL) colidx_ = new index_t[num_edges_]; rowptr_[0] = 0; } @@ -165,11 +174,11 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { #ifdef CPU_ONLY void LearningGraph::dealloc() { assert (!is_device); - if (rowptr_ != NULL) delete rowptr_; - if (colidx_ != NULL) delete colidx_; - if (degrees_ != NULL) delete degrees_; - if (vertex_data_ != NULL) delete vertex_data_; - if (edge_data_ != NULL) delete edge_data_; + if (rowptr_ != NULL) delete [] rowptr_; + if (colidx_ != NULL) delete [] colidx_; + if (degrees_ != NULL) delete [] degrees_; + if (vertex_data_ != NULL) delete [] vertex_data_; + if (edge_data_ != NULL) delete [] edge_data_; } #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 39909b4d34..c0127a54f2 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -162,23 +162,22 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned ep = 0; ep < num_epochs; ep++) { - galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); t_epoch.Start(); if (subgraph_sample_size) { if (num_subg_remain == 0) { - //galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... "); + galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... 
"); Timer t_subgen; t_subgen.Start(); // generate subgraphs +#ifdef CPU_ONLY //for (int sid = 0; sid < num_subgraphs; sid++) { galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; tid = galois::substrate::ThreadPool::getTID(); -#ifdef CPU_ONLY sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); -#endif }, galois::loopname("subgraph_gen")); +#endif num_subg_remain = num_subgraphs; t_subgen.Stop(); //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); @@ -210,6 +209,7 @@ void Net::train(optimizer* opt, bool need_validate) { } // training steps + galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -313,7 +313,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); label_t* labels; - if (subgraph_sample_size) { + if (type == "train" && subgraph_sample_size) { labels = context->get_labels_subg_ptr(); } else { labels = context->get_labels_ptr(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 5cc8593647..115ff6d81d 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -135,10 +135,10 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, float_free_device(d_fp); float_free_device(d_fn); float_free_device(d_tn); - delete h_tp; - delete h_fp; - delete h_fn; - delete h_tn; + delete[] h_tp; + delete[] h_fp; + delete[] h_fn; + delete[] h_tn; return f1_micro; } diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 2157d97dc5..ba338f5012 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -114,7 +114,8 @@ void print_vertex_set(VertexSet vertex_set) { void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { //unsigned myseed = time(NULL); - unsigned myseed = tid; + unsigned myseed = tid + time(NULL); + //unsigned myseed = tid; //DBx: Dashboard line x, IAx: Index array line x std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; DB0.reserve(subg_deg*m*ETA); diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index f6f9c57643..6815d1304f 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -177,6 +177,7 @@ struct CSRGraph { size_t size() { return size_t(nnodes); } size_t sizeEdges() { return size_t(nedges); } + void degree_counting() {} index_type nnodes, nedges; index_type* row_start; // row_start[node] points into edge_dst, node starts at // 0, row_start[nnodes] = nedges From e8bda7a4890fe189260aead174daf15fb275fa0d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:42:50 -0500 Subject: [PATCH 228/660] distgalois: enable def, move around cmakelist --- CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a56a1702e9..58b143766d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -337,17 +337,21 @@ add_custom_target(apps) # Core libraries (lib) add_subdirectory(libgalois) -if(USE_DEEPGALOIS) - add_subdirectory(libdeepgalois) - add_subdirectory(lonestargnn) -endif(USE_DEEPGALOIS) - if (ENABLE_DIST_GALOIS) + # currently making use of this in deepgalois to distinguish dist from no dist + # note this has to go before the libdeepgalois subdirectory is added below + 
add_definitions(-DGALOIS_USE_DIST) find_package(MPI REQUIRED) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) endif() + +if(USE_DEEPGALOIS) + add_subdirectory(libdeepgalois) + add_subdirectory(lonestargnn) +endif(USE_DEEPGALOIS) + if (ENABLE_HETERO_GALOIS) enable_language(CUDA) string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) From 15953d19f0b3e74a6e36692e9aa1b79e927f8c6a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:43:23 -0500 Subject: [PATCH 229/660] dist graph index types, exposure of csr arrasy --- .../include/galois/graphs/DistributedGraph.h | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 3912e41f15..d35bdbc91c 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -67,13 +67,16 @@ enum MASTERS_DISTRIBUTION { * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph */ -template +template class DistGraph { private: //! Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; - using GraphTy = galois::graphs::LC_CSR_Graph; + using GraphTy = galois::graphs::LC_CSR_Graph; protected: //! The internal graph used by DistGraph to represent the graph @@ -1029,11 +1032,20 @@ class DistGraph { galois::gDebug("Deallocating CSR in DistGraph"); graph.deallocate(); } + + +//////////////////////////////////////////////////////////////////////////////// +// what follows are GNN functions; some are not great (e.g. expose arrays) +// TODO figure out better way to do this +//////////////////////////////////////////////////////////////////////////////// + EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); } + NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); } }; -template +template constexpr const char* const - galois::graphs::DistGraph::GRNAME; + galois::graphs::DistGraph::GRNAME; } // end namespace graphs } // end namespace galois From 41d53578492c86069f19085eb990eeaac2297f6c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:44:28 -0500 Subject: [PATCH 230/660] sync vars for graphconvlayer defined in cpp only --- libdeepgalois/include/deepgalois/types.h | 7 +++++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 3a579a9c5c..87e7411689 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -44,11 +44,14 @@ enum class net_phase { train, test }; #ifdef GALOIS_USE_DIST namespace deepgalois { + // TODO only being used by graph conv layer at the moment so extern works, + // but this design is bad and needs to be revisited + //! Set this to let sync struct know where to get data from - static float_t* _dataToSync = nullptr; + extern float_t* _dataToSync; //! Set this to let sync struct know the size of the vector to use during //! sync - static long unsigned _syncVectorSize = 0; + extern long unsigned _syncVectorSize; } #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index e50d66f5ae..354db106e9 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -4,6 +4,13 @@ namespace deepgalois { +//! 
Set this to let sync struct know where to get data from +float_t* _dataToSync = nullptr; +//! Set this to let sync struct know the size of the vector to use during +//! sync +long unsigned _syncVectorSize = 0; + + graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float_t dropout_rate, std::vector in_dims, From 0065fe91e9db76950fe0bc8c721f94172b41ac82 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:45:21 -0500 Subject: [PATCH 231/660] gnn distgraphloader, include LLVM command line --- lonestargnn/CMakeLists.txt | 4 +--- lonestargnn/include/DistributedGraphLoader.h | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 0c313d742c..62fe9b321f 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -19,9 +19,7 @@ endif() if(ENABLE_DIST_GALOIS) add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) - target_include_directories(distgraphloader PUBLIC - include - ) + target_include_directories(distgraphloader PUBLIC include) target_link_libraries(distgraphloader galois_cusp) endif() diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h index 247ad0763c..f5a896b3de 100644 --- a/lonestargnn/include/DistributedGraphLoader.h +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -32,6 +32,7 @@ #include "galois/graphs/CuSPPartitioner.h" #include "deepgalois/configs.h" +#include "llvm/Support/CommandLine.h" /******************************************************************************* * Supported partitioning schemes From d95fc7736e2892601053430a38f56da6881779e0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:46:49 -0500 Subject: [PATCH 232/660] gtypes edgeiterator change for distgraph --- libdeepgalois/include/deepgalois/gtypes.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index fe759803e2..697d386d9a 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -29,6 +29,8 @@ typedef galois::graphs::LC_CSR_Graph; #endif From 91ee8dc256a4c75d86223beedd955e958368ce24 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:47:14 -0500 Subject: [PATCH 233/660] WIP: disabling sampler for dist build --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/net.h | 4 +++- libdeepgalois/include/deepgalois/sampler.h | 4 ++++ libdeepgalois/src/layers/aggregator.cpp | 2 +- libdeepgalois/src/net.cpp | 10 +++++++++- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 157e0151ad..a22985b3fa 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -72,6 +72,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way +# also don't link sampler set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -84,7 +85,6 @@ set(sources src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp - src/sampler.cpp src/lgraph.cpp src/utils.cpp src/node.cpp diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 6708e2ce63..ad0b1547f3 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -43,7 +43,7 @@ class Net { bool 
selfloop, bool single, bool l2norm, bool dense, unsigned neigh_sample_size = 0, unsigned subg_sample = 0); #ifdef GALOIS_USE_DIST - void dist_init(Graph* dGraph); + void dist_init(Graph* graph, std::string dataset_str); #endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -120,7 +120,9 @@ class Net { #endif #ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST Sampler *sampler; +#endif #endif // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index c92e8471d3..eb3b936d18 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -1,3 +1,5 @@ +#ifndef GALOIS_USE_DIST + #pragma once #include "deepgalois/gtypes.h" @@ -51,3 +53,5 @@ class Sampler { }; } + +#endif diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index fc841a6361..430106e16d 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,9 +4,9 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - size_t n = g.size(); //std::cout << "[update_all] graph size: " << n << "\n"; #ifndef GALOIS_USE_DIST + size_t n = g.size(); galois::do_all(galois::iterate(size_t(0), n),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c0127a54f2..74ffba528b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -87,6 +87,7 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); +#ifndef GALOIS_USE_DIST context->set_use_subgraph(subgraph_sample_size > 0); #ifdef CPU_ONLY if (subgraph_sample_size) sampler = new deepgalois::Sampler(); @@ -95,10 +96,11 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs copy_masks_device(num_samples, val_masks, d_val_masks); context->copy_data_to_device(); // copy labels and input features to the device #endif +#endif } #ifdef GALOIS_USE_DIST -void Net::dist_init(Graph* graph) { +void Net::dist_init(Graph* graph, std::string dataset_str) { dGraph = graph; context = new deepgalois::DistContext(); num_samples = dGraph->size(); @@ -151,12 +153,14 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; #ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST if (subgraph_sample_size) { context->createSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[num_samples*num_subgraphs]; galois::gPrint("\nConstruct training vertex set induced graph...\n"); sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } +#endif #endif galois::gPrint("\nStart training...\n"); Timer t_epoch; @@ -171,22 +175,26 @@ void Net::train(optimizer* opt, bool need_validate) { t_subgen.Start(); // generate subgraphs #ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST //for (int sid = 0; sid < num_subgraphs; sid++) { galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; tid = galois::substrate::ThreadPool::getTID(); sampler->subgraph_sample(subgraph_sample_size, 
*(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); }, galois::loopname("subgraph_gen")); +#endif #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); } +#ifndef GALOIS_USE_DIST for (int i = 0; i < num_subgraphs; i++) { auto sg_ptr = context->getSubgraphPointer(i); sg_ptr->degree_counting(); //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); } +#endif GALOIS_USE_DIST num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); From 42ec7dac87a408730d8cb4d34a7eb8566c2666b6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 4 May 2020 16:15:01 -0500 Subject: [PATCH 234/660] fix some build errors --- .../include/deepgalois/DistContext.h | 8 ++-- libdeepgalois/include/deepgalois/lgraph.h | 26 +++++++----- libdeepgalois/src/lgraph.cpp | 22 +++++++--- libdeepgalois/src/lgraph.cu | 40 +++++++++---------- libdeepgalois/src/net.cpp | 6 +-- lonestargnn/gcn/gcn.cpp | 2 +- 6 files changed, 60 insertions(+), 44 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 1304b631f8..953010f09a 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -17,7 +17,7 @@ class DistContext { galois::graphs::GluonSubstrate* syncSubstrate; Graph* graph_cpu; // the input graph, |V| = N - Graph* subgraph_cpu; + std::vector subgraphs_cpu; label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D @@ -27,6 +27,7 @@ class DistContext { float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factors; // normalization constant based on graph structure + float_t* norm_factors_subg; // normalization constant for subgraph public: DistContext(); @@ -47,18 +48,19 @@ class DistContext { //! 
find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_computing(bool is_subgraph, int subg_id); + void norm_factor_computing(bool is_subgraph, int subg_id = 0); void createSubgraphs(int num_subgraphs) {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer() { return subgraph_cpu; }; + Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } + float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 315ec1145a..8d450a1a23 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -6,22 +6,30 @@ namespace deepgalois { class LearningGraph { + typedef std::vector IndexList; + //typedef index_t* IndexList; protected: bool is_device; index_t num_vertices_; index_t num_edges_; - index_t *rowptr_; - index_t *colidx_; - index_t *degrees_; + IndexList rowptr_; + IndexList colidx_; + IndexList degrees_; vdata_t *vertex_data_; edata_t *edge_data_; + + index_t *d_rowptr_; + index_t *d_colidx_; + index_t *d_degrees_; + vdata_t *d_vertex_data_; + edata_t *d_edge_data_; std::vector> mirrorNodes; public: typedef size_t iterator; //using iterator = boost::counting_iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), - rowptr_(NULL), colidx_(NULL), degrees_(NULL), + //rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } @@ -35,17 +43,17 @@ class LearningGraph { index_t get_degree(index_t vid) { return degrees_[vid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - index_t* row_start_ptr() { return rowptr_; } - index_t* edge_dst_ptr() { return colidx_; } - index_t* degrees_ptr() { return degrees_; } + index_t* row_start_ptr() { return &rowptr_[0]; } + index_t* edge_dst_ptr() { return &colidx_[0]; } + index_t* degrees_ptr() { return °rees_[0]; } edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } void progressPrint(unsigned maxii, unsigned ii); void allocOnDevice(bool no_edge_data_); - void copy_to_cpu(LearningGraph ©graph); - void copy_to_gpu(LearningGraph ©graph); + void copy_to_cpu(); + void copy_to_gpu(); void dealloc(); void degree_counting(); void allocateFrom(index_t nv, index_t ne); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 684d9b89e8..a99ce7df36 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -39,6 +39,7 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { void LearningGraph::allocateFrom(index_t nv, index_t ne) { //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); +/* if (num_vertices_ != nv) { if (rowptr_ != NULL) delete [] rowptr_; if 
(degrees_ != NULL) delete [] degrees_; @@ -52,6 +53,12 @@ void LearningGraph::allocateFrom(index_t nv, index_t ne) { } if (rowptr_ == NULL) rowptr_ = new index_t[num_vertices_+1]; if (colidx_ == NULL) colidx_ = new index_t[num_edges_]; +*/ + num_vertices_ = nv; + num_edges_ = ne; + rowptr_.resize(num_vertices_+1); + colidx_.resize(num_edges_); + degrees_.resize(num_vertices_); rowptr_[0] = 0; } @@ -69,8 +76,8 @@ void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { } void LearningGraph::degree_counting() { - if (degrees_ != NULL) return; - degrees_ = new index_t[num_vertices_]; + //if (degrees_ != NULL) return; + //degrees_ = new index_t[num_vertices_]; galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), [&] (auto v) { degrees_[v] = rowptr_[v+1] - rowptr_[v]; }, galois::loopname("DegreeCounting")); @@ -125,10 +132,11 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { } printf("num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); - degrees_ = new index_t[num_vertices_]; - rowptr_ = new index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; - rowptr_[0] = 0; + allocateFrom(nv, ne); + //degrees_ = new index_t[num_vertices_]; + //rowptr_ = new index_t[num_vertices_+1]; + //colidx_ = new index_t[num_edges_]; + //rowptr_[0] = 0; for (unsigned ii = 0; ii < num_vertices_; ++ii) { rowptr_[ii+1] = le64toh(outIdx[ii]); degrees_[ii] = rowptr_[ii+1] - rowptr_[ii]; @@ -173,12 +181,14 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { #ifdef CPU_ONLY void LearningGraph::dealloc() { +/* assert (!is_device); if (rowptr_ != NULL) delete [] rowptr_; if (colidx_ != NULL) delete [] colidx_; if (degrees_ != NULL) delete [] degrees_; if (vertex_data_ != NULL) delete [] vertex_data_; if (edge_data_ != NULL) delete [] edge_data_; +//*/ } #endif diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 0a925bbbdb..3a379a649e 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -6,40 +6,36 @@ namespace deepgalois { void LearningGraph::dealloc() { assert(is_device); - CUDA_CHECK(cudaFree(colidx_)); - CUDA_CHECK(cudaFree(rowptr_)); - CUDA_CHECK(cudaFree(degrees_)); - if (edge_data_ != NULL) CUDA_CHECK(cudaFree(edge_data_)); - if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(vertex_data_)); + CUDA_CHECK(cudaFree(d_colidx_)); + CUDA_CHECK(cudaFree(d_rowptr_)); + CUDA_CHECK(cudaFree(d_degrees_)); + if (edge_data_ != NULL) CUDA_CHECK(cudaFree(d_edge_data_)); + if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(d_vertex_data_)); } void LearningGraph::allocOnDevice(bool no_edge_data__) { - if (colidx_ != NULL) return; - CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_t))); + if (d_colidx_ != NULL) return; + CUDA_CHECK(cudaMalloc((void **) &d_colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) &d_rowptr_, (num_vertices_+1) * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, num_edges_ * sizeof(edge_data___t))); //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); is_device = true; } -void LearningGraph::copy_to_gpu(LearningGraph ©graph) { - copygraph.init(num_vertices_, num_edges_); - copygraph.allocOnDevice(edge_data_ == NULL); - 
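Taken together, the lgraph.cpp and lgraph.cu changes in this patch converge on a common pattern: the host owns the CSR arrays as std::vector members, the device keeps its own raw buffers (the new d_rowptr_/d_colidx_/d_degrees_ members), and an explicit copy moves data between the two. A reduced sketch of that pattern follows; the struct, member and function names are illustrative only, not the library's API, and CUDA_CHECK is the error-checking macro this codebase already defines:

    #include <vector>
    #include <cuda_runtime.h>

    using index_t = unsigned;  // stand-in for the index type in deepgalois/types.h

    struct HostDeviceCSR {
      std::vector<index_t> h_rowptr, h_colidx;  // owned on the host
      index_t* d_rowptr = nullptr;              // device mirrors
      index_t* d_colidx = nullptr;

      void copy_to_gpu() {
        size_t nv = h_rowptr.size() - 1, ne = h_colidx.size();
        if (d_rowptr == nullptr) {
          CUDA_CHECK(cudaMalloc((void**)&d_rowptr, (nv + 1) * sizeof(index_t)));
          CUDA_CHECK(cudaMalloc((void**)&d_colidx, ne * sizeof(index_t)));
        }
        // destination first, host source second, then the direction flag
        CUDA_CHECK(cudaMemcpy(d_rowptr, h_rowptr.data(),
                              (nv + 1) * sizeof(index_t), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_colidx, h_colidx.data(),
                              ne * sizeof(index_t), cudaMemcpyHostToDevice));
      }
    };

One detail worth double-checking in the rewritten copy_to_gpu/copy_to_cpu below: cudaMemcpy takes the destination pointer first, so with cudaMemcpyHostToDevice the host array should be the second argument.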
CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); +void LearningGraph::copy_to_gpu() { + allocOnDevice(edge_data_ == NULL); + CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); } -void LearningGraph::copy_to_cpu(LearningGraph ©graph) { - assert(is_device); - assert(copygraph.size() == num_vertices_); - assert(copygraph.sizeEdges() == num_edges_); - CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); +void LearningGraph::copy_to_cpu() { + CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 74ffba528b..9ade42ff9b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -194,7 +194,7 @@ void Net::train(optimizer* opt, bool need_validate) { sg_ptr->degree_counting(); //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); } -#endif GALOIS_USE_DIST +#endif //GALOIS_USE_DIST num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); @@ -530,9 +530,9 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks uint32_t localID = dGraph->getLID(i); if (masks[localID] == 1) { // get prediction - auto preds = math::argmax(num_classes, preds+localID*num_classes); + auto pred = math::argmax(num_classes, &preds[localID*num_classes]); // check prediction - if ((label_t)preds == ground_truth[localID]) + if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; } } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 489553a689..9cbb0eb77f 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -21,7 +21,7 @@ int main(int argc, char** argv) { #ifdef GALOIS_USE_DIST std::vector dummyVec; deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph); + 
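The masked_accuracy change in net.cpp above is more than a cosmetic rename. In C++ a local name is in scope from its point of declaration onward, so in the removed line

    auto preds = math::argmax(num_classes, preds + localID * num_classes);

the preds inside the initializer already refers to the brand-new local (whose auto type has not been deduced yet) rather than to the float_t* parameter, and the statement cannot compile. Giving the local a distinct name, as the added lines do, restores the intended lookup:

    auto pred = math::argmax(num_classes, &preds[localID * num_classes]);
    if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0;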
network.dist_init(dGraph, dataset); #endif // read network, features, ground truth, initialize metadata From 0cb1cf05ce1c0468d0d45e5b059295256eeb17cb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 16:26:22 -0500 Subject: [PATCH 235/660] linking llvm gnn graph loader, llvm partition cl --- lonestargnn/CMakeLists.txt | 2 +- lonestargnn/src/DistributedGraphLoader.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 62fe9b321f..0f7ef10320 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -20,7 +20,7 @@ endif() if(ENABLE_DIST_GALOIS) add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) target_include_directories(distgraphloader PUBLIC include) - target_link_libraries(distgraphloader galois_cusp) + target_link_libraries(distgraphloader galois_cusp LLVMSupport) endif() add_subdirectory(gcn) diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestargnn/src/DistributedGraphLoader.cpp index dbdf24ab90..7c309dedc2 100644 --- a/lonestargnn/src/DistributedGraphLoader.cpp +++ b/lonestargnn/src/DistributedGraphLoader.cpp @@ -44,6 +44,5 @@ cll::opt partitionScheme( clEnumValN(GINGER_I, "ginger-i", "ginger, incoming edges, using CuSP"), clEnumValN(FENNEL_O, "fennel-o", "fennel, outgoing edge cut, using CuSP"), clEnumValN(FENNEL_I, "fennel-i", "fennel, incoming edge cut, using CuSP"), - clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP"), - clEnumValEnd), + clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP")), cll::init(OEC)); From 09402d1724c7438f0b1b95c3e3df5cd9a131c4ea Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 4 May 2020 20:46:16 -0500 Subject: [PATCH 236/660] fix a bug --- libdeepgalois/include/deepgalois/context.h | 15 ++++++---- libdeepgalois/src/context.cpp | 33 +++++++++++++--------- libdeepgalois/src/net.cpp | 2 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 4a0ce506b2..5683c26f12 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -32,7 +32,7 @@ class Context { label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } - float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } + float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } @@ -50,9 +50,9 @@ class Context { Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return h_feats_subg; } + float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return h_labels_subg; } + label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else CSRGraph graph_gpu; // the input graph, |V| = N std::vector subgraphs_gpu; @@ -76,15 +76,18 @@ class Context { bool is_selfloop_added; // whether selfloop is added to the input graph bool use_subgraph; // whether to use subgraph label_t *h_labels; // labels for classification. 
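The accessor changes above pair with the member declarations a few lines further down: the per-subgraph label, feature and normalization buffers become std::vector members, and the pointer-returning getters keep the layer code unchanged by handing out the vector's storage. A reduced sketch of the idiom, with names shortened from context.h and context.cpp:

    std::vector<label_t> h_labels_subg;            // sized per subgraph
    label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; }

    void gen_subgraph_labels(size_t m) {
      h_labels_subg.resize(m);  // reuses the old allocation when m is not larger
      // ... then copy in the masked labels, as Context::gen_subgraph_labels does
    }

Because the vectors own their storage, the manual new/delete pairs and the if (ptr == NULL) guards that this patch comments out are no longer needed.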
Single-class label: Nx1, multi-class label: NxE - label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D - float_t* h_feats_subg; // input features for subgraph + //label_t *h_labels_subg; // labels for subgraph + //float_t* h_feats_subg; // input features for subgraph label_t* d_labels; // labels on device label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factors; // normalization constant based on graph structure - float_t* norm_factors_subg; // normalization constant for subgraph + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector norm_factors_subg; // normalization constant for subgraph + //float_t* norm_factors_subg; // normalization constant for subgraph void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9f2b306371..a4b0c27be2 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -22,19 +22,20 @@ Context& Context::Get() { Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_labels_subg(NULL), - h_feats(NULL), h_feats_subg(NULL), + h_labels(NULL), h_feats(NULL), + //h_labels_subg(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factors(NULL), norm_factors_subg(NULL) {} + norm_factors(NULL) {} + //norm_factors_subg(NULL) {} Context::~Context() { if (h_labels) delete[] h_labels; - if (h_labels_subg) delete[] h_labels_subg; + //if (h_labels_subg) delete[] h_labels_subg; if (h_feats) delete[] h_feats; - if (h_feats_subg) delete[] h_feats_subg; + //if (h_feats_subg) delete[] h_feats_subg; if (norm_factors) delete[] norm_factors; - if (norm_factors_subg) delete[] norm_factors_subg; + //if (norm_factors_subg) delete[] norm_factors_subg; } size_t Context::read_graph(std::string dataset_str, bool selfloop) { @@ -50,14 +51,15 @@ void Context::createSubgraphs(int num_subgraphs) { // generate labels for the subgraph, m is subgraph size void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { - if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + //if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + h_labels_subg.resize(m); size_t count = 0; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { if (is_single_class) { h_labels_subg[count] = h_labels[i]; } else { - std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, h_labels_subg+count*num_classes); + std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, &h_labels_subg[count*num_classes]); } count ++; } @@ -67,10 +69,11 @@ void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { // generate input features for the subgraph, m is subgraph size void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { size_t count = 0; - if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + //if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + h_feats_subg.resize(m*feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, h_feats_subg+count*feat_len); + std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, &h_feats_subg[count*feat_len]); count ++; } } @@ -154,11 +157,13 @@ void Context::alloc_norm_factor() 
{ void Context::alloc_subgraph_norm_factor(int subg_id) { Graph* g = getSubgraphPointer(subg_id); - if (norm_factors_subg == NULL) + //if (norm_factors_subg == NULL) #ifdef USE_MKL - norm_factors_subg = new float_t[g->sizeEdges()]; + //norm_factors_subg = new float_t[g->sizeEdges()]; + norm_factors_subg.resize(g->sizeEdges()); #else - norm_factors_subg = new float_t[g->size()]; + norm_factors_subg.resize(g->size()); + //norm_factors_subg = new float_t[g->size()]; #endif } @@ -172,7 +177,7 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { } else { g = getSubgraphPointer(subg_id); alloc_subgraph_norm_factor(subg_id); - constants = norm_factors_subg; + constants = get_norm_factors_subg_ptr(); } auto g_size = g->size(); g->degree_counting(); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9ade42ff9b..9ee41b3302 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -170,7 +170,7 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... "); + galois::gPrint("Generating ", num_subgraphs, " subgraphs "); Timer t_subgen; t_subgen.Start(); // generate subgraphs From b1b77c025c0e0ca02d49849789297015e6abd802 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 09:56:04 -0500 Subject: [PATCH 237/660] update sampler --- libdeepgalois/include/deepgalois/net.h | 10 ++++--- libdeepgalois/src/context.cpp | 6 +++- libdeepgalois/src/net.cpp | 41 +++++++++++++------------- libdeepgalois/src/sampler.cpp | 31 +++++++++++++++---- lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/include/lonestargnn.h | 4 +-- 6 files changed, 59 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index ad0b1547f3..9c794a9063 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -37,11 +37,13 @@ class Net { train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), val_count(0), test_begin(0), test_end(0), test_count(0), - train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - void init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, + val_interval(1), num_subgraphs(1), num_vertices_sg(9000), + train_masks(NULL), val_masks(NULL), + test_masks(NULL), context(NULL) {} + void init(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sample_size = 0, unsigned subg_sample = 0); + unsigned neigh_sample_sz, unsigned subg_sample_sz, int val_itv); #ifdef GALOIS_USE_DIST void dist_init(Graph* graph, std::string dataset_str); #endif @@ -92,7 +94,7 @@ class Net { size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) - unsigned num_epochs; // number of epochs + int num_epochs; // number of epochs float learning_rate; // learning rate float dropout_rate; // dropout rate float weight_decay; // weighti decay for over-fitting diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index a4b0c27be2..caec001182 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -52,7 +52,11 @@ void Context::createSubgraphs(int num_subgraphs) { // generate labels for the subgraph, m is subgraph size void 
Context::gen_subgraph_labels(size_t m, const mask_t *masks) { //if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - h_labels_subg.resize(m); + if (is_single_class) { + h_labels_subg.resize(m); + } else { + h_labels_subg.resize(m*num_classes); + } size_t count = 0; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9ee41b3302..d62ac752b1 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -9,10 +9,10 @@ namespace deepgalois { -void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, +void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz) { + unsigned neigh_sz, unsigned subg_sz, int val_itv) { assert(n_conv > 0); num_threads = nt; num_conv_layers = n_conv; @@ -25,8 +25,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs has_dense = dense; neighbor_sample_size = neigh_sz; subgraph_sample_size = subg_sz; - val_interval = 1; - num_subgraphs = num_threads; + val_interval = val_itv; + //num_subgraphs = 1;//num_threads; galois::gPrint("Configuration: num_threads ", num_threads, ", num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, @@ -165,7 +165,7 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint("\nStart training...\n"); Timer t_epoch; // run epochs - for (unsigned ep = 0; ep < num_epochs; ep++) { + for (int ep = 0; ep < num_epochs; ep++) { t_epoch.Start(); if (subgraph_sample_size) { @@ -176,12 +176,12 @@ void Net::train(optimizer* opt, bool need_validate) { // generate subgraphs #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST - //for (int sid = 0; sid < num_subgraphs; sid++) { - galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + for (int sid = 0; sid < num_subgraphs; sid++) { + //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; - tid = galois::substrate::ThreadPool::getTID(); + //tid = galois::substrate::ThreadPool::getTID(); sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); - }, galois::loopname("subgraph_gen")); + }//, galois::loopname("subgraph_gen")); #endif #endif num_subg_remain = num_subgraphs; @@ -247,17 +247,6 @@ void Net::train(optimizer* opt, bool need_validate) { double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; if (need_validate && ep % val_interval == 0) { - if (subgraph_sample_size) { // switch to the original graph - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); -#ifdef CPU_ONLY - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getGraphPointer()); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); - } - layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); - layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data -#endif - } // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); @@ -308,7 +297,17 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { count = test_count; masks = test_masks; } -#ifndef CPU_ONLY +#ifdef CPU_ONLY + if (subgraph_sample_size && type != "train") { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); + for (size_t i 
= 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(context->getGraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + } + layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data + } +#else if (type == "train") { masks = d_train_masks; } else if (type == "val") { diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index ba338f5012..47317bdd3d 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -2,6 +2,7 @@ #include "deepgalois/sampler.h" #include #include +#define PARALLEL_GEN namespace deepgalois { inline unsigned getDegree(Graph *g, index_t v) { @@ -54,15 +55,21 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { +#else for (size_t src = 0; src < n; src++) { - //galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { +#endif if (masks[src] == 1) { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } } - }//, galois::loopname("update_degrees")); + } +#ifdef PARALLEL_GEN + , galois::loopname("update_degrees")); +#endif } void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { @@ -75,8 +82,11 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { +#else for (size_t src = 0; src < n; src++) { - //galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { +#endif sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; @@ -85,7 +95,10 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } } - }//, galois::loopname("gen_subgraph")); + } +#ifdef PARALLEL_GEN + , galois::loopname("gen_subgraph")); +#endif #endif } @@ -319,8 +332,11 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { +#else for (size_t i = 0; i < nv; i++) { - //galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { +#endif sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; @@ -330,7 +346,10 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.constructEdge(offsets[i]+j, dst, 0); j ++; } - }//, galois::loopname("construct_graph")); + } +#ifdef PARALLEL_GEN + , galois::loopname("construct_graph")); +#endif #endif } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 9cbb0eb77f..de3f2a76ee 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -28,7 +28,7 @@ int main(int argc, char** argv) { network.init(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz); + neighbor_sample_sz, subgraph_sample_sz, val_interval); 
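A note on the PARALLEL_GEN switch introduced in sampler.cpp above: the same loop body is shared between a galois::do_all and a plain serial for loop, with the preprocessor choosing the loop header and the trailing loopname argument. Reduced to a self-contained sketch (process_vertex stands in for the real per-vertex work):

    void process_vertex(size_t v);     // stand-in for the loop body's work

    void for_each_vertex(size_t n) {
    #ifdef PARALLEL_GEN
      galois::do_all(galois::iterate(size_t(0), n), [&](const auto v) {
    #else
      for (size_t v = 0; v < n; v++) {
    #endif
        process_vertex(v);             // shared body
      }
    #ifdef PARALLEL_GEN
      , galois::loopname("for_each_vertex"));
    #endif
    }

One closing brace does double duty: it ends the lambda in the parallel build and the for statement in the serial one, which is why the two #ifdef blocks must stay exactly balanced.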
// default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 77a2777d5f..a72668daab 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -20,7 +20,7 @@ static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' //static cll::opt model("m", // cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt epochs("k", +static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); static cll::opt num_conv_layers("nc", cll::desc("number of convolutional layers, (default value 2)"), cll::init(2)); @@ -41,7 +41,7 @@ static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); static cll::opt add_dense("d", cll::desc("add an dense layer"), cll::init(0)); -static cll::opt val_interval("vi", cll::desc("validation interval (default value 1)"), cll::init(1)); +static cll::opt val_interval("vi", cll::desc("validation interval (default value 1)"), cll::init(1)); static cll::opt neighbor_sample_sz("ns", cll::desc("neighbor sampling size (default value 0)"), cll::init(0)); static cll::opt subgraph_sample_sz("ss", cll::desc("subgraph sampling size (default value 0)"), cll::init(0)); From 766b5ee78614ffe0a9cd85ea7c6289a286d1a7db Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 12:53:41 -0500 Subject: [PATCH 238/660] use mkl19 --- CMakeLists.txt | 2 +- .../include/deepgalois/layers/aggregator.h | 4 +- .../include/deepgalois/math_functions.hh | 3 +- libdeepgalois/src/context.cu | 7 ++-- libdeepgalois/src/layers/aggregator.cpp | 8 ++-- .../src/layers/sigmoid_loss_layer.cpp | 6 +-- libdeepgalois/src/math_functions.cpp | 41 ++++++++++++++----- libdeepgalois/src/utils.cpp | 2 +- 8 files changed, 44 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58b143766d..d0fa1a80c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,7 @@ if(USE_VTUNE) endif() if(USE_MKL_BLAS) - SET(INTEL_ROOT /opt/apps/sysnet/intel/17.0) + SET(INTEL_ROOT /opt/apps/sysnet/intel/19.0) SET(MKL_ROOT ${INTEL_ROOT}/mkl) SET(INTEL_LIBS_DIR ${INTEL_ROOT}/lib/intel64_lin) find_package(MKL) diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index ffdd3935a8..1b2d4b5104 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -6,9 +6,9 @@ #include "deepgalois/gtypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor); + bool norm, float_t* norm_factor); void update_all_csrmm(size_t len, Graph& g, const float_t* in, - float_t* out, bool norm, const float_t* norm_factor); + float_t* out, bool norm, float_t* norm_factor); } #else #include "graph_gpu.h" diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index fc9e798633..6e7ac10fe2 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -29,8 +29,7 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, // 
single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nonzero_idx, + const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nonzero_idx, const float* B, const float beta, float* C); // matrix-vector multiply diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 6f42196428..99f14ce11a 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -64,10 +64,9 @@ curandGenerator_t Context::curand_generator_ = 0; Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_labels_subg(NULL), - h_feats(NULL), h_feats_subg(NULL), - d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), + h_labels(NULL), h_feats(NULL), + d_labels(NULL), d_feats(NULL), + d_labels_subg(NULL), d_feats_subg(NULL), norm_factors(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 430106e16d..b298107f4e 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -3,7 +3,7 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { + bool norm, float_t* norm_factor) { //std::cout << "[update_all] graph size: " << n << "\n"; #ifndef GALOIS_USE_DIST size_t n = g.size(); @@ -42,14 +42,12 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou } void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { + bool norm, float_t* norm_factor) { galois::StatTimer Tcsrmm("CSRMM-MKL"); - //galois::gPrint("csrmm mkl\n"); Tcsrmm.start(); unsigned n = g.size(); math::clear_cpu(n*len, out); - math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, - (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, out); Tcsrmm.stop(); } #endif diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 10a4f8454a..2288a8da26 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -28,7 +28,7 @@ inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked size_t idx = len * i; // output is normalized input for this layer math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid @@ -46,7 +46,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* float_t* out_grad, float_t* in_grad) { size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked size_t idx = len * i; float_t *norm_grad = new float_t[len]; float_t *ground_truth = new float_t[len]; @@ -68,7 +68,7 @@ 
acc_t sigmoid_loss_layer::get_prediction_loss() { total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i]) { + if (!use_mask || masks_[i]) { total_loss += loss[i]; valid_sample_count += 1; } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index ec43be8656..4dcfb941ac 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -57,23 +57,42 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.start(); int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); Tmatmul.stop(); } void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, + const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { #ifdef USE_MKL - mkl_set_num_threads(56); - const char *matdescra = "GXXCX";//6 bytes - const char transa = 'N'; - //printf("Calling Intel MKL\n"); exit(1); - mkl_scsrmm(&transa, &M , &N, &K, &alpha , matdescra, - A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, - B, &N, &beta , C, &N); + //mkl_set_num_threads(56); + //const char *matdescra = "GXXCX";//6 bytes + //const char transa = 'N'; + //mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); + sparse_status_t status; + bool need_trans = false; + bool is_row_major = true; + sparse_matrix_t csrA = NULL; + sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; + sparse_layout_t layout = (is_row_major ? SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); + status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, A_idx_ptr + 1, A_nnz_idx, A_nonzeros); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + sparse_operation_t transa = (need_trans ? 
SPARSE_OPERATION_TRANSPOSE : SPARSE_OPERATION_NON_TRANSPOSE); + struct matrix_descr descrA; + descrA.type = SPARSE_MATRIX_TYPE_GENERAL; + //descrA.mode = SPARSE_FILL_MODE_UPPER; + //descrA.diag = SPARSE_DIAG_NON_UNIT; + //mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); + //mkl_sparse_optimize(csrA); + status = mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + mkl_sparse_destroy(csrA); #else NOT_IMPLEMENTED; #endif diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index dedb9c225a..882154f5c0 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -59,7 +59,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); for (size_t row = begin; row < end; row ++) { //galois::do_all(galois::iterate(begin, end), [&](const auto& row) { - if (masks[row] == 1) { + if (masks == NULL || masks[row] == 1) { auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { //__sync_fetch_and_add(&tp_cls, 1); From d61963d836b22b5bb96143d1e813337870dfa203 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 16:03:20 -0500 Subject: [PATCH 239/660] lonestargnn now subdirectory in lonestar --- CMakeLists.txt | 1 - lonestar/CMakeLists.txt | 4 ++++ {lonestargnn => lonestar/gnn}/CMakeLists.txt | 2 +- {lonestargnn => lonestar/gnn}/gcn/CMakeLists.txt | 0 {lonestargnn => lonestar/gnn/gcn}/README.md | 0 {lonestargnn => lonestar/gnn}/gcn/gcn.cpp | 0 {lonestargnn => lonestar/gnn}/gin/CMakeLists.txt | 0 {lonestargnn => lonestar/gnn}/gin/gin.cpp | 0 {lonestargnn => lonestar/gnn}/graphsage/gs-mean.cpp | 0 .../gnn}/include/DistributedGraphLoader.h | 0 {lonestargnn => lonestar/gnn}/include/lonestargnn.h | 0 {lonestargnn => lonestar/gnn}/run-citeseer.sh | 0 {lonestargnn => lonestar/gnn}/src/DistributedGraphLoader.cpp | 0 13 files changed, 5 insertions(+), 2 deletions(-) rename {lonestargnn => lonestar/gnn}/CMakeLists.txt (92%) rename {lonestargnn => lonestar/gnn}/gcn/CMakeLists.txt (100%) rename {lonestargnn => lonestar/gnn/gcn}/README.md (100%) rename {lonestargnn => lonestar/gnn}/gcn/gcn.cpp (100%) rename {lonestargnn => lonestar/gnn}/gin/CMakeLists.txt (100%) rename {lonestargnn => lonestar/gnn}/gin/gin.cpp (100%) rename {lonestargnn => lonestar/gnn}/graphsage/gs-mean.cpp (100%) rename {lonestargnn => lonestar/gnn}/include/DistributedGraphLoader.h (100%) rename {lonestargnn => lonestar/gnn}/include/lonestargnn.h (100%) rename {lonestargnn => lonestar/gnn}/run-citeseer.sh (100%) rename {lonestargnn => lonestar/gnn}/src/DistributedGraphLoader.cpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f2a43aa02..b8c5e98cf6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -297,7 +297,6 @@ endif() if(USE_DEEPGALOIS) add_subdirectory(libdeepgalois) - add_subdirectory(lonestargnn) endif(USE_DEEPGALOIS) if (ENABLE_HETERO_GALOIS) diff --git a/lonestar/CMakeLists.txt b/lonestar/CMakeLists.txt index fbb645e7cf..58e911aa0d 100644 --- a/lonestar/CMakeLists.txt +++ b/lonestar/CMakeLists.txt @@ -193,3 +193,7 @@ add_subdirectory(analytics) add_subdirectory(eda) add_subdirectory(mining) add_subdirectory(scientific) + +if(USE_DEEPGALOIS) + add_subdirectory(gnn) +endif(USE_DEEPGALOIS) diff --git a/lonestargnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt similarity index 92% rename from lonestargnn/CMakeLists.txt rename to 
lonestar/gnn/CMakeLists.txt index 0f7ef10320..d0551bdadc 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) +include_directories(${CMAKE_SOURCE_DIR}/lonestar/gnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) link_directories(${CUDA_HOME}/lib64) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt similarity index 100% rename from lonestargnn/gcn/CMakeLists.txt rename to lonestar/gnn/gcn/CMakeLists.txt diff --git a/lonestargnn/README.md b/lonestar/gnn/gcn/README.md similarity index 100% rename from lonestargnn/README.md rename to lonestar/gnn/gcn/README.md diff --git a/lonestargnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp similarity index 100% rename from lonestargnn/gcn/gcn.cpp rename to lonestar/gnn/gcn/gcn.cpp diff --git a/lonestargnn/gin/CMakeLists.txt b/lonestar/gnn/gin/CMakeLists.txt similarity index 100% rename from lonestargnn/gin/CMakeLists.txt rename to lonestar/gnn/gin/CMakeLists.txt diff --git a/lonestargnn/gin/gin.cpp b/lonestar/gnn/gin/gin.cpp similarity index 100% rename from lonestargnn/gin/gin.cpp rename to lonestar/gnn/gin/gin.cpp diff --git a/lonestargnn/graphsage/gs-mean.cpp b/lonestar/gnn/graphsage/gs-mean.cpp similarity index 100% rename from lonestargnn/graphsage/gs-mean.cpp rename to lonestar/gnn/graphsage/gs-mean.cpp diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h similarity index 100% rename from lonestargnn/include/DistributedGraphLoader.h rename to lonestar/gnn/include/DistributedGraphLoader.h diff --git a/lonestargnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h similarity index 100% rename from lonestargnn/include/lonestargnn.h rename to lonestar/gnn/include/lonestargnn.h diff --git a/lonestargnn/run-citeseer.sh b/lonestar/gnn/run-citeseer.sh similarity index 100% rename from lonestargnn/run-citeseer.sh rename to lonestar/gnn/run-citeseer.sh diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp similarity index 100% rename from lonestargnn/src/DistributedGraphLoader.cpp rename to lonestar/gnn/src/DistributedGraphLoader.cpp From 8a851dc85fb693f1969ad43d418fa745a836fd63 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 16:14:56 -0500 Subject: [PATCH 240/660] readdded libpangolin that was accidentally deleted in merge --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b8c5e98cf6..3fb831effd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,6 +309,7 @@ if (ENABLE_HETERO_GALOIS) add_subdirectory(libgpu) endif() +add_subdirectory(libpangolin) # Applications (apps) add_subdirectory(lonestar) From 3fa21e02077117ffd16e1b6a27eb578efbfc56d1 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 17:55:16 -0500 Subject: [PATCH 241/660] merge fix --- libdeepgalois/CMakeLists.txt | 9 ++ .../deepgalois/layers/graph_conv_layer.h | 12 ++- .../include/deepgalois/layers/layer.h | 9 +- .../include/deepgalois/layers/node.h | 7 +- libdeepgalois/include/deepgalois/lgraph.h | 33 +++++-- libdeepgalois/include/deepgalois/optimizer.h | 30 +++---- libdeepgalois/include/deepgalois/types.h | 1 + libdeepgalois/src/layers/aggregator.cpp | 1 + libdeepgalois/src/layers/graph_conv_layer.cpp | 16 +--- libdeepgalois/src/layers/l2_norm_layer.cpp | 1 + libdeepgalois/src/layers/leaky_relu_layer.cpp | 1 + 
libdeepgalois/src/layers/relu_layer.cpp | 1 + .../src/layers/sigmoid_loss_layer.cpp | 1 + .../src/layers/softmax_loss_layer.cpp | 10 +-- libdeepgalois/src/lgraph.cpp | 39 ++++++++- libdeepgalois/src/math_functions.cpp | 3 + libdeepgalois/src/optimizer.cpp | 85 +++++++++---------- libdeepgalois/src/utils.cpp | 1 + 18 files changed, 161 insertions(+), 99 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index a22985b3fa..3f5bc11a95 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -91,6 +91,14 @@ set(sources src/net.cpp ) else() +if(ENABLE_DIST_GALOIS) +set(sources + src/context.cpp + src/lgraph.cpp + src/node.cpp + src/net.cpp +) +else() set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -110,6 +118,7 @@ set(sources src/net.cpp ) endif() +endif() #set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 6cc40c266d..92bc999653 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -28,14 +28,21 @@ class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float_t dropout_rate, - std::vector in_dims, std::vector out_dims); + std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); + scale_ = 1. / (1. - dropout_rate_); + } graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } - void set_netphase(net_phase ctx) override { phase_ = ctx; } virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data @@ -62,7 +69,6 @@ class graph_conv_layer : public layer { bool dropout_; // whether to use dropout at first const float_t dropout_rate_; float_t scale_; - net_phase phase_; float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* in_temp1; diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 0ffab6de41..206e5e7da3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -48,7 +48,7 @@ class layer : public deepgalois::node { layer(unsigned level, std::vector in_dims, std::vector out_dims) - : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), + : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), output_dims(out_dims), labels(NULL) { } virtual ~layer() = default; @@ -68,7 +68,7 @@ class layer : public deepgalois::node { float_t* get_grads_device_ptr() { return d_weight_grad; } // set methods - virtual void set_netphase(net_phase phase) {} + void set_netphase(net_phase ctx) { phase_ = ctx; } void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } @@ -137,8 +137,8 @@ class layer : public deepgalois::node { #ifdef CPU_ONLY // parallelize only when target size is big enough to mitigate thread // spawning overhead. - bool parallel = (W.size() >= 512); - opt->update(layer::weight_grad, layer::W, parallel); // W += grad + //bool parallel = (W.size() >= 512); + opt->update(layer::weight_grad, layer::W); // W += grad #else opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif @@ -152,6 +152,7 @@ class layer : public deepgalois::node { size_t end_; // sample end index size_t count_; // number of samples size_t num_dims; // number of dimensions + net_phase phase_; // in which phase: train, val or test std::vector input_dims; // input dimensions std::vector output_dims; // output dimentions std::string name_; // name of this layer diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index 9b43167656..ec7c319d87 100644 --- a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -26,14 +26,15 @@ typedef std::shared_ptr edgeptr_t; // edge class node : public std::enable_shared_from_this { public: - node(size_t in_size, size_t out_size) { - } //: prev_(in_size), next_(out_size) {} + node() { prev_= NULL; next_ = NULL; } + //node(size_t in_size, size_t out_size) { + //} //: prev_(in_size), next_(out_size) {} virtual ~node() {} const edgeptr_t prev() const { return prev_; } const edgeptr_t next() const { return next_; } protected: - node() = delete; + //node() = delete; friend void connect(layer* head, layer* tail); mutable edgeptr_t prev_; mutable edgeptr_t next_; diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 8d450a1a23..f8e5ce8315 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -43,11 +43,7 @@ class LearningGraph { index_t get_degree(index_t vid) { return degrees_[vid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - index_t* row_start_ptr() { return &rowptr_[0]; } - index_t* edge_dst_ptr() { return &colidx_[0]; } - index_t* 
degrees_ptr() { return &degrees_[0]; } - edata_t* edge_data_ptr() { return edge_data_; } - vdata_t* vertex_data_ptr() { return vertex_data_; } + iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } void progressPrint(unsigned maxii, unsigned ii); @@ -60,6 +56,7 @@ class LearningGraph { void constructNodes(); void fixEndEdge(index_t vid, index_t row_end); void constructEdge(index_t eid, index_t dst, edata_t edata); + void add_selfloop(); bool isLocal(index_t vid); index_t getLID(index_t vid); @@ -67,6 +64,32 @@ class LearningGraph { std::vector>& getMirrorNodes(); uint64_t numMasters(); uint64_t globalSize(); + +#ifdef CPU_ONLY + index_t* row_start_ptr() { return &rowptr_[0]; } + const index_t* row_start_ptr() const { return &rowptr_[0]; } + index_t* edge_dst_ptr() { return &colidx_[0]; } + const index_t* edge_dst_ptr() const { return &colidx_[0]; } + index_t* degrees_ptr() { return &degrees_[0]; } + edata_t* edge_data_ptr() { return edge_data_; } + vdata_t* vertex_data_ptr() { return vertex_data_; } +#else + __device__ index_t getEdgeDst(unsigned edge) { return colidx_[edge]; } + __device__ index_t edge_begin(unsigned src) { return d_rowptr_[src]; } + __device__ index_t edge_end(unsigned src) { return d_rowptr_[src+1]; } + __device__ vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + __device__ index_t getDegree(unsigned vid) { return d_degrees_[vid]; } + index_t *row_start_ptr() { return d_rowptr_; } + const index_t *row_start_ptr() const { return d_rowptr_; } + index_t *edge_dst_ptr() { return d_colidx_; } + const index_t *edge_dst_ptr() const { return d_colidx_; } + index_t* degrees_ptr() { return d_degrees_; } + edata_t *edge_data_ptr() { return d_edge_data_; } + vdata_t *vertex_data_ptr() { return d_vertex_data_; } + //const vdata_t *vertex_data_ptr() const { return vertex_data_; } + //const edata_t *edge_data_ptr() const { return edge_data; } +#endif + }; } diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index b745f12cb6..c9db614814 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -29,7 +29,7 @@ struct optimizer { optimizer& operator=(const optimizer&) = default; optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; - virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; + virtual void update(const vec_t& dW, vec_t& W) = 0; virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -65,8 +65,8 @@ struct stateful_optimizer : public optimizer { **/ struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate private: float_t eps; @@ -80,8 +80,8 @@ struct adagrad : public stateful_optimizer<1> { **/ struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate
float_t mu; // decay term private: @@ -94,9 +94,9 @@ struct adam : public stateful_optimizer<2> { adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); + void update(const vec_t& dW, vec_t& W); #ifdef CPU_ONLY - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W); #else void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -121,8 +121,8 @@ struct adamax : public stateful_optimizer<2> { adamax() : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t b1; // decay term @@ -137,8 +137,8 @@ struct adamax : public stateful_optimizer<2> { // slightly faster than tiny_dnn::momentum struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -153,8 +153,8 @@ struct gradient_descent : public optimizer { struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t lambda; // weight decay @@ -172,8 +172,8 @@ struct nesterov_momentum : public stateful_optimizer<1> { public: nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 87e7411689..e1c405d653 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -41,6 +41,7 @@ enum class net_phase { train, test }; #define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE +#define UNUSED(expr) do { (void)(expr); } while (0) #ifdef GALOIS_USE_DIST namespace deepgalois { diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index b298107f4e..4468e72ea7 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -43,6 +43,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { + UNUSED(norm); galois::StatTimer Tcsrmm("CSRMM-MKL"); Tcsrmm.start(); unsigned n = g.size(); diff --git 
a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 354db106e9..e46a2477a6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,20 +10,7 @@ float_t* _dataToSync = nullptr; //! sync long unsigned _syncVectorSize = 0; - -graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, - bool bias, bool dropout, float_t dropout_rate, - std::vector in_dims, - std::vector out_dims) - : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), - dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); - scale_ = 1. / (1. - dropout_rate_); -} - +#ifdef CPU_ONLY inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); std::default_random_engine rng(seed); @@ -43,7 +30,6 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t } } -#ifdef CPU_ONLY // aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { // normalization constant based on graph structure diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 3e12a1d603..0e3ea946f0 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -32,6 +32,7 @@ void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_dat void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + UNUSED(out_data); size_t x = input_dims[0]; size_t y = input_dims[1]; galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index f7cfe375cc..e4ebfaad1e 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -22,6 +22,7 @@ void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_ // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + UNUSED(in_data); math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index aee6e29a07..e351d11d4f 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -14,6 +14,7 @@ void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + UNUSED(in_data); size_t n = input_dims[0] * input_dims[1]; math::d_relu_cpu(n, out_grad, out_data, in_grad); } diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 2288a8da26..4cddbaa854 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -44,6 +44,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data,
float_t* ou void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + if (out_grad) delete[] out_grad; size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 54e461121f..a53c81488b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -27,8 +27,7 @@ inline label_t softmax_loss_layer::get_label(size_t i) { // TODO: need kernel fusion optimization // 𝑦[i] = 𝑒^𝑥[i] / Σ 𝑒^𝑥[𝑘] -void softmax_loss_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { +void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked @@ -46,9 +45,9 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, // in this forward pass; only a post-process pretty much } -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, +void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + if (out_grad) delete[] out_grad; // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { @@ -59,8 +58,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); + math::d_softmax(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); } }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-bw")); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index a99ce7df36..b9da782599 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -12,9 +12,9 @@ namespace deepgalois { -bool LearningGraph::isLocal(index_t vid) { return true; } +bool LearningGraph::isLocal(index_t vid) { UNUSED(vid); return true; } -index_t LearningGraph::getLID(index_t vid) { return 0; } +index_t LearningGraph::getLID(index_t vid) { UNUSED(vid); return 0; } bool LearningGraph::is_vertex_cut() {return true; } @@ -73,6 +73,7 @@ void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { assert(dst < num_vertices_); assert(eid < num_edges_); colidx_[eid] = dst; + if (edge_data_) edge_data_[eid] = edata; } void LearningGraph::degree_counting() { @@ -83,6 +84,40 @@ void LearningGraph::degree_counting() { }, galois::loopname("DegreeCounting")); } +void LearningGraph::add_selfloop() { + //print_neighbors(nnodes-1); + //print_neighbors(0); + auto old_colidx_ = colidx_; + colidx_.resize(num_vertices_ + num_edges_); + for (index_t i = 0; i < num_vertices_; i++) { + auto start = rowptr_[i]; + auto end = rowptr_[i+1]; + bool selfloop_inserted = false; + if (start == end) { + colidx_[start+i] = i; + continue; + } + for (auto e = start; e != end; e++) { + auto dst =
old_colidx_[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + colidx_[e+i] = i; + colidx_[e+i+1] = dst; + } else if (e+1 == end) { + selfloop_inserted = true; + colidx_[e+i+1] = i; + colidx_[e+i] = dst; + } else colidx_[e+i] = dst; + } else colidx_[e+i+1] = dst; + } + } + for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; + num_edges_ += num_vertices_; + //print_neighbors(nnodes-1); + //print_neighbors(0); +} + void LearningGraph::readGraph(std::string path, std::string dataset) { std::string filename = path + dataset + ".csgr"; } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 4dcfb941ac..27de4e144f 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -64,6 +64,7 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, void csrmm_cpu(const int M, const int N, const int K, const int nnz, const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { + UNUSED(nnz); #ifdef USE_MKL //mkl_set_num_threads(56); //const char *matdescra = "GXXCX";//6 bytes @@ -331,6 +332,7 @@ void softmax(size_t n, const float_t* input, float_t* output) { void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + UNUSED(y); vec_t df(n, 0); for (size_t i = 0; i < n; i++) { for (size_t j = 0; j < n; j++) { @@ -374,6 +376,7 @@ void sigmoid(size_t n, const float_t* in, float_t* out) { } void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + UNUSED(y); for (size_t i = 0; i < n; i++) { dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); } diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index 0f00b4da33..a73b5cd6d2 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -4,23 +4,21 @@ namespace deepgalois { -void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { +void adagrad::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); +/* for (size_t i = 0; i < W.size(); i++) { g[i] += dW[i] * dW[i]; W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); } - } +*/ } -void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { +void RMSprop::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { @@ -29,59 +27,54 @@ void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { }, galois::loopname("rms_update")); } -void adam::update(const vec_t& dW, vec_t& W, bool parallelize) { +void adam::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), - galois::loopname("adam_update")); + 
galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } -void adamax::update(const vec_t& dW, vec_t& W, bool parallelize) { +void adamax::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); b1_t *= b1; } -void gradient_descent::update(const vec_t& dW, vec_t& W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, - galois::loopname("gradient_descent_update")); +void gradient_descent::update(const vec_t& dW, vec_t& W) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); + }, galois::loopname("gradient_descent_update")); } -void momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { +void momentum::update(const vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, galois::loopname("momentum_update")); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, galois::loopname("momentum_update")); } -void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { +void nesterov_momentum::update(const vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, galois::loopname("nesterov_momentum_update")); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, galois::loopname("nesterov_momentum_update")); } } // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 882154f5c0..58a68d1d7d 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -53,6 +53,7 @@ template uint32_t* parallel_prefix_sum(const std::vector Date: Tue, 5 May 2020 18:09:36 -0500 Subject: [PATCH 242/660] fix update_gpu --- libdeepgalois/include/deepgalois/optimizer.h | 18 ++++++++-- .../src/layers/sigmoid_loss_layer.cpp | 2 +- .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/optimizer.cu | 36 +++++++++++++++++++ 4 files changed, 53 
insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index c9db614814..4fd7caa800 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -30,7 +30,9 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; +#ifndef CPU_ONLY virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; +#endif virtual void reset() {} // override to implement pre-learning action }; @@ -66,7 +68,9 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate private: float_t eps; @@ -81,7 +85,9 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t mu; // decay term private: @@ -95,9 +101,7 @@ struct adam : public stateful_optimizer<2> { : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef CPU_ONLY - void update_gpu(const size_t n, const float_t* dW, float_t* W); -#else +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -122,7 +126,9 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t b1; // decay term @@ -138,7 +144,9 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -154,7 +162,9 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t lambda; // weight decay @@ -173,7 +183,9 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 4cddbaa854..60b4227ac6 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -44,7 +44,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou void 
sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (out_grad) delete[] out_grad; + UNUSED(out_grad); size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index a53c81488b..6360db26be 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -47,7 +47,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* ou void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (out_grad) delete[] out_grad; + UNUSED(out_grad); // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 355d959254..3a4365da6e 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -39,4 +39,40 @@ void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { b2_t *= b2; } +void adagrad::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void RMSprop::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void adamax::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void gradient_descent::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void nesterov_momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + } From f7c41e8a431235f1645bd68d9ca204797fa592be Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 18:02:49 -0500 Subject: [PATCH 243/660] remove use of unused; unused params leave unnamed instead --- libdeepgalois/include/deepgalois/types.h | 1 - libdeepgalois/src/layers/aggregator.cpp | 3 +-- libdeepgalois/src/layers/l2_norm_layer.cpp | 3 +-- libdeepgalois/src/layers/leaky_relu_layer.cpp | 3 +-- libdeepgalois/src/layers/relu_layer.cpp | 3 +-- libdeepgalois/src/lgraph.cpp | 4 ++-- libdeepgalois/src/math_functions.cpp | 9 +++------ libdeepgalois/src/utils.cpp | 3 +-- 8 files changed, 10 insertions(+), 19 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index e1c405d653..87e7411689 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -41,7 +41,6 @@ enum class net_phase { train, test }; #define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE -#define UNUSED(expr) do { (void)(expr); } while (0) #ifdef GALOIS_USE_DIST namespace deepgalois { diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 4468e72ea7..d17cf79a72 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -42,8 +42,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou } void 
deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, float_t* norm_factor) { - UNUSED(norm); + bool, float_t* norm_factor) { galois::StatTimer Tcsrmm("CSRMM-MKL"); Tcsrmm.start(); unsigned n = g.size(); diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 0e3ea946f0..a5a77eb82e 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -30,9 +30,8 @@ void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_dat }, galois::loopname("l2_norm")); } -void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, float_t* out_grad, float_t* in_grad) { - UNUSED(out_data); size_t x = input_dims[0]; size_t y = input_dims[1]; galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index e4ebfaad1e..f0ea5f591e 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -20,9 +20,8 @@ void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_ // 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 𝜕𝐿 / 𝜕𝑦𝑙 * ε, 𝑖𝑓 (𝑦[𝑙] ≤ 0) // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) -void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, +void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - UNUSED(in_data); math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index e351d11d4f..9e54d64975 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -12,9 +12,8 @@ void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) // 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 0, 𝑖𝑓 (𝑦[𝑙] < 0) // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 -void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, +void relu_layer::back_propagation(const float_t*, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - UNUSED(in_data); size_t n = input_dims[0] * input_dims[1]; math::d_relu_cpu(n, out_grad, out_data, in_grad); } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index b9da782599..6531034794 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -12,9 +12,9 @@ namespace deepgalois { -bool LearningGraph::isLocal(index_t vid) { UNUSED(vid); return true; } +bool LearningGraph::isLocal(index_t) { return true; } -index_t LearningGraph::getLID(index_t vid) { UNUSED(vid); return 0; } +index_t LearningGraph::getLID(index_t) { return 0; } bool LearningGraph::is_vertex_cut() {return true; } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 27de4e144f..9e1b997f47 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -61,10 +61,9 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.stop(); } -void csrmm_cpu(const int M, const int N, const int K, const int nnz, +void csrmm_cpu(const int M, const int N, const int K, const int, const float alpha, float* A_nonzeros, int*
A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { - UNUSED(nnz); #ifdef USE_MKL //mkl_set_num_threads(56); //const char *matdescra = "GXXCX";//6 bytes @@ -330,9 +329,8 @@ void softmax(size_t n, const float_t* input, float_t* output) { output[i] /= denominator; } -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, +void d_softmax(size_t n, const float_t*, const float_t* p, float_t* dy, const float_t* dp) { - UNUSED(y); vec_t df(n, 0); for (size_t i = 0; i < n; i++) { for (size_t j = 0; j < n; j++) { @@ -375,8 +373,7 @@ void sigmoid(size_t n, const float_t* in, float_t* out) { } } -void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { - UNUSED(y); +void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, const float_t* dp) { for (size_t i = 0; i < n; i++) { dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 58a68d1d7d..00a7d5696a 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -51,9 +51,8 @@ template uint32_t* parallel_prefix_sum(const std::vector Date: Tue, 5 May 2020 18:09:58 -0500 Subject: [PATCH 244/660] mkl: don't name params if not used --- libdeepgalois/src/math_functions.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 9e1b997f47..0923411ff2 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -61,9 +61,15 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.stop(); } -void csrmm_cpu(const int M, const int N, const int K, const int, +#ifdef USE_MKL +void csrmm_cpu(const int M, const int N, const int K, const int, const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { +#else +void csrmm_cpu(const int, const int, const int, const int, + const float, float*, int*, int*, + const float*, const float, float*) { +#endif #ifdef USE_MKL //mkl_set_num_threads(56); //const char *matdescra = "GXXCX";//6 bytes @@ -99,14 +105,14 @@ void csrmm_cpu(const int M, const int N, const int K, const int, } // matrix-vector multiply -void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } inline void rng_uniform_cpu(size_t n, float_t* r) { #ifdef USE_MKL - VSLStreamStatePtr stream; + VSLStreamStatePtr stream; // Initializing the streams vslNewStream(&stream, VSL_BRNG_SOBOL, 1); // Generating @@ -238,7 +244,7 @@ void clear_cpu(size_t n, float_t* in) { // memset(in, 0, n*sizeof(float_t)); } -void dropout(size_t m, float scale, float dropout_rate, +void dropout(size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { for (size_t i = 0; i < m; ++i) masks[i] = bernoulli(dropout_rate); @@ -310,7 +316,7 @@ void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) }, galois::chunk_size<64>(), galois::loopname("leaky_relu")); } -void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out) { // TODO: vectorize 
galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { From 2af0f556f866bb6bf5e4f3cf94d57873d33de6e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 18:13:56 -0500 Subject: [PATCH 245/660] got rid of more unused calls in layers/opt --- .../src/layers/sigmoid_loss_layer.cpp | 3 +- .../src/layers/softmax_loss_layer.cpp | 3 +- libdeepgalois/src/optimizer.cu | 36 ++++--------------- 3 files changed, 8 insertions(+), 34 deletions(-) diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 60b4227ac6..5a511d2308 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -43,8 +43,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - UNUSED(out_grad); + float_t*, float_t* in_grad) { size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 6360db26be..2fc7ac80dc 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -46,8 +46,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* ou } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - UNUSED(out_grad); + float_t*, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 3a4365da6e..6953a804c1 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -39,40 +39,16 @@ void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { b2_t *= b2; } -void adagrad::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void adagrad::update_gpu(const size_t, const float_t*, float_t*) {} -void RMSprop::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void RMSprop::update_gpu(const size_t, const float_t*, float_t*) {} -void adamax::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void adamax::update_gpu(const size_t, const float_t*, float_t*) {} -void gradient_descent::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void gradient_descent::update_gpu(const size_t, const float_t*, float_t*) {} -void momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void momentum::update_gpu(const size_t, const float_t*, float_t*) {} -void nesterov_momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void nesterov_momentum::update_gpu(const size_t, const float_t*, float_t*) {} } From 089ff4d1b076155074768001f14054e252dd44ea Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 18:28:37 -0500 Subject: [PATCH 246/660] Unused vars fix for distributed gcn --- 
libdeepgalois/include/deepgalois/DistContext.h | 10 +++++++--- .../include/deepgalois/layers/GluonGradients.h | 8 ++++---- .../include/deepgalois/layers/GradientSyncStructs.h | 8 ++++---- .../deepgalois/layers/GraphConvSyncStructures.h | 8 ++++---- libdeepgalois/src/DistContext.cpp | 3 ++- libdeepgalois/src/net.cpp | 3 +++ libgalois/include/galois/graphs/LC_CSR_Graph.h | 3 +-- lonestar/gnn/include/DistributedGraphLoader.h | 3 +-- 8 files changed, 26 insertions(+), 20 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 953010f09a..7fce4a12d9 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -49,9 +49,13 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_computing(bool is_subgraph, int subg_id = 0); - void createSubgraphs(int num_subgraphs) {} - void gen_subgraph_labels(size_t m, const mask_t *masks) {} - void gen_subgraph_feats(size_t m, const mask_t *masks) {} + //void createSubgraphs(int num_subgraphs) {} + //void gen_subgraph_labels(size_t m, const mask_t *masks) {} + //void gen_subgraph_feats(size_t m, const mask_t *masks) {} + // TODO define these + void createSubgraphs(int) {} + void gen_subgraph_labels(size_t, const mask_t *) {} + void gen_subgraph_feats(size_t, const mask_t *) {} float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index 1643a62027..a7aa66d576 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -164,22 +164,22 @@ class GluonGradients { } //! no edges, return 0 - unsigned edge_begin(uint32_t dummy) { + unsigned edge_begin(uint32_t) { return 0; } //! no edges, return 0 - unsigned edge_end(uint32_t dummy) { + unsigned edge_end(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeDst(uint32_t dummy) { + unsigned getEdgeDst(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeData(uint32_t dummy) { + unsigned getEdgeData(uint32_t) { return 0; } }; diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index d0074d11ed..1d26b87007 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -6,11 +6,11 @@ struct GradientSync { using ValTy = float_t; - static ValTy extract(uint32_t node_id, float_t& weight) { + static ValTy extract(uint32_t, float_t& weight) { return weight; } - static bool reduce(uint32_t node_id, float_t& weight, ValTy y) { + static bool reduce(uint32_t, float_t& weight, ValTy y) { // TODO merge function here // for now make sure the weights are close enough //if (std::abs(weight - y) > 0.00001) { @@ -21,12 +21,12 @@ struct GradientSync { } //! reset weight to 0 - static void reset(uint32_t node_id, float_t &weight) { + static void reset(uint32_t, float_t &weight) { weight = 0; } //! 
save weight - static void setVal(uint32_t node_id, float_t &weight, ValTy y) { + static void setVal(uint32_t, float_t &weight, ValTy y) { weight = y; } diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index 3b95d55f82..e4874e468f 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -5,7 +5,7 @@ struct GraphConvSync { using ValTy = std::vector; //! return a vector of floats to sync - static ValTy extract(uint32_t node_id, char& filler) { + static ValTy extract(uint32_t node_id, char&) { // TODO figure out how to avoid copy from C array to vector; best // way is if original data is in a vector probably, but that has the // issue of not being able to directly call BLAS @@ -23,7 +23,7 @@ struct GraphConvSync { //! reduction is addition in this case; add received vector to //! own vector - static bool reduce(uint32_t node_id, char& filler, ValTy y) { + static bool reduce(uint32_t node_id, char&, ValTy y) { assert(y.size() == deepgalois::_syncVectorSize); // loop and do addition for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { @@ -33,11 +33,11 @@ struct GraphConvSync { } //! do nothing (waste of a write) - static void reset(uint32_t node_id, char& filler) { + static void reset(uint32_t, char&) { } //! element wise set - static void setVal(uint32_t node_id, char& filler, ValTy y) { + static void setVal(uint32_t node_id, char&, ValTy y) { assert(y.size() == deepgalois::_syncVectorSize); // loop and do addition for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 3f915ec062..66a1a0885e 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,7 +151,8 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { +//void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { +void DistContext::norm_factor_computing(bool, int) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index d62ac752b1..052fab6a40 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -38,6 +38,9 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, context = new deepgalois::Context(); num_samples = context->read_graph(dataset_str, selfloop); context->set_label_class(is_single_class); +#else + // only done here to avoid unused var complain TODO find better way + (void)selfloop; #endif // read graph, get num nodes diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 19aefefb27..6e8c65012b 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -748,8 +748,7 @@ class LC_CSR_Graph : } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, - EdgeIndexTy e, + void edgeDataCopy(EdgeData&, EdgeData&, EdgeIndexTy, EdgeIndexTy, typename std::enable_if::type* = 0) { // does nothing } diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index f5a896b3de..7827c1a39f 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ 
b/lonestar/gnn/include/DistributedGraphLoader.h @@ -117,13 +117,12 @@ namespace graphs { * * @tparam NodeData node data to store in graph * @tparam EdgeData edge data to store in graph - * @param scaleFactor How to split nodes among hosts * @returns a pointer to a newly allocated DistGraph based on the command line * loaded based on command line arguments */ template DistGraph* -constructSymmetricGraph(std::vector& scaleFactor) { +constructSymmetricGraph(std::vector&) { std::string inputFile = deepgalois::path + dataset + ".csgr"; galois::gInfo("File to read is ", inputFile); From 7cb394b37117a08a73d5224ffff9ec7c9ddb57f3 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 19:59:32 -0500 Subject: [PATCH 247/660] add reader --- libdeepgalois/CMakeLists.txt | 16 +- libdeepgalois/include/deepgalois/context.h | 24 ++- libdeepgalois/include/deepgalois/reader.h | 18 +++ libdeepgalois/src/context.cpp | 179 ++------------------- libdeepgalois/src/context.cu | 19 +-- libdeepgalois/src/net.cpp | 19 +-- libdeepgalois/src/reader.cpp | 144 +++++++++++++++++ libgpu/include/checker.h | 15 -- libgpu/include/gg.h | 1 - libgpu/include/graph_gpu.h | 13 -- 10 files changed, 212 insertions(+), 236 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/reader.h create mode 100644 libdeepgalois/src/reader.cpp delete mode 100644 libgpu/include/checker.h diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3f5bc11a95..b46750b060 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -70,6 +70,12 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if(ENABLE_HETERO_GALOIS) +set(sources + src/node.cpp + src/net.cpp +) +else() if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way # also don't link sampler @@ -85,20 +91,13 @@ set(sources src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp + src/reader.cpp src/lgraph.cpp src/utils.cpp src/node.cpp src/net.cpp ) else() -if(ENABLE_DIST_GALOIS) -set(sources - src/context.cpp - src/lgraph.cpp - src/node.cpp - src/net.cpp -) -else() set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -112,6 +111,7 @@ set(sources src/optimizer.cpp src/context.cpp src/sampler.cpp + src/reader.cpp src/lgraph.cpp src/utils.cpp src/node.cpp diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 5683c26f12..f9ca056421 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,7 +6,7 @@ #include #include #include "deepgalois/types.h" -//#include +#include "deepgalois/reader.h" #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -19,21 +19,25 @@ namespace deepgalois { class Context { public: Context(); + Context(bool use_gpu) : is_device(use_gpu), n(0), num_classes(0), feat_len(0), is_single_class(true), + is_selfloop_added(false), use_subgraph(false), h_labels(NULL), h_feats(NULL), + d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} + ~Context(); - size_t read_graph(std::string dataset_str, bool selfloop); - size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); - size_t read_graph_gpu(std::string dataset_str, bool selfloop); - size_t read_labels(std::string dataset_str); - size_t read_features(std::string dataset_str, std::string filetype = "bin"); - size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& 
end, mask_t* masks); + size_t read_graph(bool selfloop); + size_t read_labels() { num_classes = reader.read_labels(is_single_class, h_labels); return num_classes; } + size_t read_features() { feat_len = reader.read_features(h_feats); return feat_len; } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); + } label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } + void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features @@ -69,6 +73,8 @@ class Context { #endif protected: + std::string dataset; + bool is_device; // is this on device or host size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D @@ -88,6 +94,8 @@ class Context { std::vector h_feats_subg; // input features for subgraph std::vector norm_factors_subg; // normalization constant for subgraph //float_t* norm_factors_subg; // normalization constant for subgraph + Reader reader; + void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h new file mode 100644 index 0000000000..090ec817f8 --- /dev/null +++ b/libdeepgalois/include/deepgalois/reader.h @@ -0,0 +1,18 @@ +#pragma once +#include "deepgalois/types.h" + +namespace deepgalois { + +class Reader { +private: + std::string dataset_str; +public: + Reader() : dataset_str("") {} + Reader(std::string dataset) : dataset_str(dataset) {} + void init(std::string dataset) { dataset_str = dataset; } + size_t read_labels(bool is_single_class, label_t*& labels); + size_t read_features(float_t*& feats, std::string filetype = "bin"); + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); +}; + +} diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index caec001182..71410eee13 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -7,42 +7,20 @@ //#include namespace deepgalois { -/* -// Make sure each thread can have different values. 
-static boost::thread_specific_ptr thread_instance_; -Context& Context::Get() { - if (!thread_instance_.get()) { - thread_instance_.reset(new Context()); - } - return *(thread_instance_.get()); -} -*/ #ifdef CPU_ONLY -Context::Context() : n(0), num_classes(0), - feat_len(0), is_single_class(true), - is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_feats(NULL), - //h_labels_subg(NULL), h_feats_subg(NULL), - d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), - norm_factors(NULL) {} - //norm_factors_subg(NULL) {} + +Context::Context() : Context(false) {} Context::~Context() { if (h_labels) delete[] h_labels; - //if (h_labels_subg) delete[] h_labels_subg; if (h_feats) delete[] h_feats; - //if (h_feats_subg) delete[] h_feats_subg; if (norm_factors) delete[] norm_factors; + //if (h_feats_subg) delete[] h_feats_subg; + //if (h_labels_subg) delete[] h_labels_subg; //if (norm_factors_subg) delete[] norm_factors_subg; } -size_t Context::read_graph(std::string dataset_str, bool selfloop) { - n = read_graph_cpu(dataset_str, "gr", selfloop); - return n; -} - void Context::createSubgraphs(int num_subgraphs) { subgraphs_cpu.resize(num_subgraphs); for (int i = 0; i < num_subgraphs; i++) @@ -83,19 +61,20 @@ void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { } } -size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { - std::string filename = path + dataset_str + ".csgr"; +size_t Context::read_graph(bool selfloop) { + std::string filename = path + dataset + ".csgr"; + std::string filetype = "gr"; galois::StatTimer Tread("GraphReadingTime"); Tread.start(); if (filetype == "el") { - filename = path + dataset_str + ".el"; + filename = path + dataset + ".el"; printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "bin") { graph_cpu->readGraphFromGRFile(filename); } else if (filetype == "gr") { graph_cpu = new Graph(); - std::string filename = path + dataset_str + ".csgr"; + std::string filename = path + dataset + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { Graph graph_temp; @@ -114,7 +93,8 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo auto g = getGraphPointer(); std::cout << "num_vertices " << g->size() << " num_edges " << g->sizeEdges() << "\n"; - return g->size(); + n = g->size(); + return n; } void Context::add_selfloop(Graph &og, Graph &g) { @@ -269,144 +249,7 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self #endif -// labels contain the ground truth (e.g. vertex classes) for each example -// (num_examples x 1). Note that labels is not one-hot encoded vector and it can -// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if -// required. -size_t Context::read_labels(std::string dataset_str) { - std::cout << "Reading labels ... 
"; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m: number of samples - in >> m >> num_classes >> std::ws; - assert(m == n); - if (is_single_class) { - std::cout << "Using single-class (one-hot) labels\n"; - h_labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 - } else { - std::cout << "Using multi-class labels\n"; - h_labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E - } - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < num_classes; ++idx) { - label_stream >> x; - if (is_single_class) { - if (x != 0) { - h_labels[v] = idx; - break; - } - } else { - h_labels[v*num_classes+idx] = x; - } - } - v++; - } - in.close(); - t_read.Stop(); - // print the number of vertex classes - std::cout << "Done, unique label counts: " << num_classes - << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; - return num_classes; -} - -//! Read features, return the length of a feature vector -//! Features are stored in the Context class -size_t Context::read_features(std::string dataset_str, std::string filetype) { - //filetype = "txt"; - std::cout << "Reading features ... "; - Timer t_read; - t_read.Start(); - size_t m; // m = number of vertices - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - - if (filetype == "bin") { - std::string file_dims = path + dataset_str + "-dims.txt"; - std::ifstream ifs; - ifs.open(file_dims, std::ios::in); - ifs >> m >> feat_len >> std::ws; - ifs.close(); - } else { - in.open(filename, std::ios::in); - in >> m >> feat_len >> std::ws; - } - std::cout << "N x D: " << m << " x " << feat_len << "\n"; - h_feats = new float_t[m * feat_len]; - if (filetype == "bin") { - filename = path + dataset_str + "-feats.bin"; - in.open(filename, std::ios::binary|std::ios::in); - in.read((char*)h_feats, sizeof(float_t) * m * feat_len); - } else { - std::string line; - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - h_feats[u * feat_len + v] = w; - } - } - in.close(); - t_read.Stop(); - std::cout << "Done, feature length: " << feat_len - << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 6; i ++) - //for (auto j = 0; j < 6; j ++) - //std::cout << "feats[" << i << "][" << j << "] = " << h_feats[i*feat_len+j] << "\n"; - return feat_len; -} -//! Get masks from datafile where first line tells range of -//! 
set to create mask from -size_t Context::read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks) { - bool dataset_found = false; - for (int i = 0; i < NUM_DATASETS; i++) { - if (dataset_str == dataset_names[i]) { - dataset_found = true; - break; - } - } - if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - // std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count++; - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; - in.close(); - return sample_count; -} /* inline void init_features(size_t dim, vec_t &x) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 99f14ce11a..7f435e8ca8 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -62,12 +62,7 @@ cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() : n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_feats(NULL), - d_labels(NULL), d_feats(NULL), - d_labels_subg(NULL), d_feats_subg(NULL), - norm_factors(NULL) { +Context::Context() : Context(true) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -100,11 +95,6 @@ void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { } -size_t Context::read_graph(std::string dataset_str, bool selfloop) { - n = read_graph_gpu(dataset_str, selfloop); - return n; -} - void Context::norm_factor_computing(bool is_subgraph, int subg_id) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { @@ -136,8 +126,8 @@ void Context::SetDevice(const int device_id) { CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } */ -size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { - std::string filename = path + dataset_str + ".csgr"; +size_t Context::read_graph(bool selfloop) { + std::string filename = path + dataset + ".csgr"; CSRGraph g; g.read(filename.c_str(), false); if (selfloop) { @@ -145,7 +135,8 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { is_selfloop_added = selfloop; } g.copy_to_gpu(graph_gpu); - return graph_gpu.nnodes; + n = graph_gpu.nnodes; + return n; } void Context::copy_data_to_device() { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 052fab6a40..719b5267ee 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -36,7 +36,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, ", weight_decay ", weight_decay, "\n"); #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - num_samples = context->read_graph(dataset_str, selfloop); + context->set_dataset(dataset_str); + num_samples = context->read_graph(selfloop); context->set_label_class(is_single_class); #else // only done here to avoid unused var complain TODO find better way @@ -44,7 +45,7 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, #endif // read graph, get num nodes - num_classes = context->read_labels(dataset_str); + num_classes = context->read_labels(); #ifndef GALOIS_USE_DIST //std::cout << "Reading label masks ... "; @@ -62,8 +63,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; } else { - train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); - val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); + train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); + val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); } #endif @@ -79,7 +80,7 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, if (has_dense) num_layers ++; // initialize feature metadata feature_dims.resize(num_layers + 1); - feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D + feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E @@ -133,8 +134,8 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { } } } else { - train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); + train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks, dGraph); } } #endif @@ -486,9 +487,9 @@ void Net::read_test_masks(std::string dataset) { #endif } else { #ifndef GALOIS_USE_DIST - test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); + test_count = 
context->read_masks("test", num_samples, test_begin, test_end, test_masks); #else - test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); + test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); #endif } #ifndef CPU_ONLY diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp new file mode 100644 index 0000000000..2ea8134254 --- /dev/null +++ b/libdeepgalois/src/reader.cpp @@ -0,0 +1,144 @@ +#include "deepgalois/reader.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" + +namespace deepgalois { + +// labels contain the ground truth (e.g. vertex classes) for each example +// (num_examples x 1). Note that labels is not one-hot encoded vector and it can +// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if +// required. +size_t Reader::read_labels(bool is_single_class, label_t*& labels) { + std::cout << "Reading labels ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, num_classes; // m: number of samples + in >> m >> num_classes >> std::ws; + if (is_single_class) { + std::cout << "Using single-class (one-hot) labels\n"; + labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + } else { + std::cout << "Using multi-class labels\n"; + labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + } + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < num_classes; ++idx) { + label_stream >> x; + if (is_single_class) { + if (x != 0) { + labels[v] = idx; + break; + } + } else { + labels[v*num_classes+idx] = x; + } + } + v++; + } + in.close(); + t_read.Stop(); + // print the number of vertex classes + std::cout << "Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; + return num_classes; +} + +//! Read features, return the length of a feature vector +//! Features are stored in the Context class +size_t Reader::read_features(float_t*& feats, std::string filetype) { + //filetype = "txt"; + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + size_t m, feat_len; // m = number of vertices + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + + if (filetype == "bin") { + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> feat_len >> std::ws; + ifs.close(); + } else { + in.open(filename, std::ios::in); + in >> m >> feat_len >> std::ws; + } + std::cout << "N x D: " << m << " x " << feat_len << "\n"; + feats = new float_t[m * feat_len]; + if (filetype == "bin") { + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary|std::ios::in); + in.read((char*)feats, sizeof(float_t) * m * feat_len); + } else { + std::string line; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u * feat_len + v] = w; + } + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len + << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 6; i ++) + //for (auto j = 0; j < 6; j ++) + //std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << "\n"; + return feat_len; +} + +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; + in.close(); + return sample_count; +} + +} diff --git a/libgpu/include/checker.h b/libgpu/include/checker.h deleted file mode 100644 index 7f2cf4e36e..0000000000 --- a/libgpu/include/checker.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef CHECKER_H -#define CHECKER_H -#include -#include - -static void check_cuda_error(const cudaError_t e, const char* file, - const int line) { - if (e != cudaSuccess) { - fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); - exit(1); - } -} -#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__) - -#endif diff --git a/libgpu/include/gg.h b/libgpu/include/gg.h index 7f4a130c23..69239fd46c 100644 --- a/libgpu/include/gg.h +++ b/libgpu/include/gg.h @@ -34,7 +34,6 @@ unsigned const debug = GGDEBUG; #include "Timer.h" -#include "checker.h" template static void check_retval(const T retval, const T expected, const char* file, diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 6815d1304f..f456c367dc 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -16,7 +16,6 @@ #include #include -#include "checker.h" // Adapted from LSG CSRGraph.h @@ -93,18 
+92,6 @@ struct CSRGraph { return edge_data[abs_edge]; }; - void init_from_mgraph(int m, int nnz, index_type *h_row_offsets, index_type *h_column_indices, node_data_type *h_labels) { - nnodes = m; - nedges = nnz; - check_cuda(cudaMalloc((void **)&row_start, (m + 1) * sizeof(index_type))); - check_cuda(cudaMalloc((void **)&edge_dst, nnz * sizeof(index_type))); - check_cuda(cudaMemcpy(row_start, h_row_offsets, (m + 1) * sizeof(index_type), cudaMemcpyHostToDevice)); - check_cuda(cudaMemcpy(edge_dst, h_column_indices, nnz * sizeof(index_type), cudaMemcpyHostToDevice)); - #ifdef ENABLE_LABEL - check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); - check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); - #endif - } void print_neighbors(index_type vid) { printf("Vertex %d neighbors: [ ", vid); index_type start = row_start[vid]; From a5dacc491c1dcc1446637091db2ea9d476cbde92 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 20:13:22 -0500 Subject: [PATCH 248/660] update node --- libdeepgalois/CMakeLists.txt | 5 +--- .../include/deepgalois/layers/node.h | 6 +--- libdeepgalois/src/node.cpp | 29 ++++--------------- libdeepgalois/src/node.cu | 23 ++++++++------- 4 files changed, 21 insertions(+), 42 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index b46750b060..43d7fb5fac 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -71,10 +71,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_HETERO_GALOIS) -set(sources - src/node.cpp - src/net.cpp -) +set(sources src/net.cpp) else() if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index ec7c319d87..e8699d2498 100644 --- a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -47,12 +47,8 @@ class edge { : num_samples_(n), ft_dim_(len), data_(NULL), grad_(NULL), prev_(prev) {} void alloc(); - void alloc_gpu(); - void merge_grads(vec_t* dst); - void merge_grads_gpu(float_t* dst); void clear_grads(); - void clear_grads_gpu(); - + void merge_grads(float_t* dst); void set_data(float_t* ptr) { data_ = ptr; } float_t* get_data() { return data_; } const float_t* get_data() const { return data_; } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index b1ee96a58b..fbd8d2bc6a 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -4,41 +4,24 @@ namespace deepgalois { void edge::alloc() { - // std::cout << "Allocating memory for tensors (intermediate features and - // gradients) ...\n"; -#ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; -#else - alloc_gpu(); -#endif } -void edge::merge_grads(vec_t* dst) { +void edge::merge_grads(float_t* dst) { assert(grad_ != NULL); - dst->resize(ft_dim_); - float_t* pdst = &(*dst)[0]; -#ifdef CPU_ONLY - std::copy(grad_, grad_ + ft_dim_, pdst); + if(dst) delete[] dst; + dst = new float_t[ft_dim_]; + std::copy(grad_, grad_ + ft_dim_, dst); // @todo consider adding parallelism and vectorization for (size_t sample = 1; sample < num_samples_; ++sample) { for (size_t i = 0; i < ft_dim_; i++) - pdst[i] += grad_[sample * ft_dim_ + i]; - // vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); + dst[i] += grad_[sample * ft_dim_ + i]; } -#else - merge_grads_gpu(pdst); -#endif } void 
edge::clear_grads() { -#ifdef CPU_ONLY - std::fill(grad_, grad_ + ft_dim_ * num_samples_, - float_t(0)); // TODO: need vectorize - // vectorize::fill(&grad_[0], grad_.size(), float_t(0)); -#else - clear_grads_gpu(); -#endif + std::fill(grad_, grad_ + ft_dim_ * num_samples_, float_t(0)); } } // namespace deepgalois diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index 88d486f369..b5a17af1fd 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -1,18 +1,21 @@ #include "deepgalois/layers/node.h" #include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" -void deepgalois::edge::alloc_gpu() { - CUDA_CHECK( - cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK( - cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +namespace deepgalois { + +void edge::alloc() { + CUDA_CHECK(cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +} + +void edge::merge_grads_gpu(float_t* dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); } -void deepgalois::edge::merge_grads_gpu(float_t* dst) { - CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), - cudaMemcpyDeviceToHost)); +void edge::clear_grads() { + //CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + init_const_gpu(num_samples_ * ft_dim_, 0.0, grad_); } -void deepgalois::edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } From 6e356d24877f7cd38210c6f2a6a1bac5486f6fa5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 09:24:23 -0500 Subject: [PATCH 249/660] remove cpp for gpu --- libdeepgalois/CMakeLists.txt | 7 +- libdeepgalois/include/deepgalois/net.h | 421 +++++++++++++++++++++-- libdeepgalois/src/net.cpp | 455 +------------------------ libdeepgalois/src/net.cu | 19 ++ libgpu/include/checker.h | 15 + libgpu/include/csr_graph.h | 4 + lonestar/gnn/gcn/CMakeLists.txt | 9 +- lonestar/gnn/gcn/gcn.cpp | 10 +- 8 files changed, 465 insertions(+), 475 deletions(-) create mode 100644 libgpu/include/checker.h diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 43d7fb5fac..23a0b44ed7 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -70,9 +70,7 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if(ENABLE_HETERO_GALOIS) -set(sources src/net.cpp) -else() +if(NOT ENABLE_HETERO_GALOIS) if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way # also don't link sampler @@ -115,9 +113,7 @@ set(sources src/net.cpp ) endif() -endif() -#set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) @@ -150,3 +146,4 @@ set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) +endif() diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 9c794a9063..5bab5f12d2 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -1,9 +1,7 @@ /** * Based on the net.hpp file from Caffe deep learning framework. 
*/ -#ifndef _MODEL_H_ -#define _MODEL_H_ - +#pragma once #include #include "deepgalois/types.h" #include "deepgalois/layers/l2_norm_layer.h" @@ -11,6 +9,7 @@ #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" +#include "deepgalois/utils.h" #ifdef CPU_ONLY #include "deepgalois/sampler.h" #endif @@ -29,6 +28,75 @@ namespace deepgalois { // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: + Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, int val_itv) : + is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), is_selfloop(selfloop) { + assert(n_conv > 0); + std::cout << "Configuration: num_threads " << num_threads + << ", num_conv_layers " << num_conv_layers + << ", num_epochs " << num_epochs + << ", hidden1 " << hidden1 + << ", learning_rate " << learning_rate + << ", dropout_rate " << dropout_rate + << ", weight_decay " << weight_decay << "\n"; + num_layers = num_conv_layers + 1; + if (has_l2norm) num_layers ++; + if (has_dense) num_layers ++; + // initialize feature metadata + feature_dims.resize(num_layers + 1); +#ifndef GALOIS_USE_DIST + context = new deepgalois::Context(); + context->set_dataset(dataset_str); + num_samples = context->read_graph(selfloop); + context->set_label_class(is_single_class); + // read graph, get num nodes + num_classes = context->read_labels(); + + //std::cout << "Reading label masks ... 
"; + train_masks = new mask_t[num_samples]; + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks+num_samples, 0); + std::fill(val_masks, val_masks+num_samples, 0); + + // get training and validation sets + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, + train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + // TODO do all can be used below + for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; + } else { + train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); + val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); + } + + if (subgraph_sample_size > train_count) { + std::cout << "FATAL: subgraph size can not be larger than the size of training set\n"; + exit(1); + } + + feature_dims[0] = context->read_features(); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers-1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E + layers.resize(num_layers); + context->set_use_subgraph(subgraph_sample_size > 0); + init(); +#endif + } + Net() : is_single_class(true), has_l2norm(false), has_dense(false), neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), num_samples(0), num_classes(0), @@ -40,10 +108,8 @@ class Net { val_interval(1), num_subgraphs(1), num_vertices_sg(9000), train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - void init(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sample_sz, unsigned subg_sample_sz, int val_itv); + + void init(); #ifdef GALOIS_USE_DIST void dist_init(Graph* graph, std::string dataset_str); #endif @@ -51,21 +117,334 @@ class Net { size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } - void construct_layers(); - void append_out_layer(size_t layer_id); - void append_l2norm_layer(size_t layer_id); - void append_dense_layer(size_t layer_id); - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true); //! 
Add a convolution layer to the network - - void train(optimizer* opt, bool need_validate); // training - double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference - void read_test_masks(std::string dataset); - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation - void bprop(); // back propogation void normalize(); // Scale gradient to counterbalance accumulation void regularize(); // add weight decay - void update_weights(optimizer* opt); // update trainable weights after back-propagation + + void train(optimizer* opt, bool need_validate) { + std::string header = ""; + std::string seperator = " "; +#ifdef GALOIS_USE_DIST + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + header = "[" + std::to_string(myID) + "] "; + seperator = "\n"; +#endif + + double total_train_time = 0.0; + int num_subg_remain = 0; +#ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST + if (subgraph_sample_size) { + context->createSubgraphs(num_subgraphs); + subgraphs_masks = new mask_t[num_samples*num_subgraphs]; + std::cout << "\nConstruct training vertex set induced graph...\n"; + sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); + } +#endif +#endif + std::cout << "\nStart training...\n"; + Timer t_epoch; + // run epochs + for (int ep = 0; ep < num_epochs; ep++) { + t_epoch.Start(); + + if (subgraph_sample_size) { + if (num_subg_remain == 0) { + std::cout << "Generating " << num_subgraphs << " subgraphs "; + Timer t_subgen; + t_subgen.Start(); + // generate subgraphs +#ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST + for (int sid = 0; sid < num_subgraphs; sid++) { + //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + unsigned tid = 0; + //tid = galois::substrate::ThreadPool::getTID(); + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); + }//, galois::loopname("subgraph_gen")); +#endif +#endif + num_subg_remain = num_subgraphs; + t_subgen.Stop(); + //std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + } +#ifndef GALOIS_USE_DIST + for (int i = 0; i < num_subgraphs; i++) { + auto sg_ptr = context->getSubgraphPointer(i); + sg_ptr->degree_counting(); + //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); + } +#endif //GALOIS_USE_DIST + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraph_ptr = context->getSubgraphPointer(sg_id); + num_vertices_sg = subgraph_ptr->size(); + //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(num_vertices_sg); + context->norm_factor_computing(1, sg_id); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(subgraph_ptr); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); + } + // update labels for subgraph + context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); + layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); + + // update features for subgraph + context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); + layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data + } + + // training steps + std::cout << header << "Epoch " << std::setw(3) << ep << seperator; + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + 
+ // forward: after this phase, layer edges will contain intermediate features + // for use during backprop + double fw_time = evaluate("train", train_loss, train_acc); + + // backward: use intermediate features + ground truth to update layers + // with feature gradients whcih are then used to calculate weight gradients + Net::bprop(); + + // gradient update: use gradients stored on each layer to update model for + // next epoch + Net::update_weights(opt); // update parameters + + // validation / testing + set_netphases(net_phase::test); + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss + << " train_acc " << train_acc << seperator; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; + if (need_validate && ep % val_interval == 0) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + double val_time = evaluate("val", val_loss, val_acc); + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << val_loss + << " val_acc " << val_acc << seperator; + std::cout << header << "time " << std::setprecision(3) << std::fixed << epoch_time + val_time + << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; + } else { + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; + } + } + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; + std::cout << "\nAverage training time: " << avg_train_time + << " ms. Throughput: " << throughput << " epoch/s\n"; + } + + // evaluate, i.e. inference or predict + double evaluate(std::string type, acc_t& loss, acc_t& acc) { + // TODO may need to do something for the dist case + Timer t_eval; + t_eval.Start(); + size_t begin = 0, end = 0, count = 0; + mask_t* masks = NULL; + if (type == "train") { + begin = train_begin; + end = train_end; + count = train_count; + masks = train_masks; + if (subgraph_sample_size) { + // update masks for subgraph + masks = NULL; + begin = 0; + end = num_vertices_sg; + count = num_vertices_sg; + } + } else if (type == "val") { + begin = val_begin; + end = val_end; + count = val_count; + masks = val_masks; + } else { + begin = test_begin; + end = test_end; + count = test_count; + masks = test_masks; + } +#ifdef CPU_ONLY + if (subgraph_sample_size && type != "train") { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(context->getGraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + } + layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data + } +#else + if (type == "train") { + masks = d_train_masks; + } else if (type == "val") { + masks = d_val_masks; + } else { + masks = d_test_masks; + } +#endif + loss = fprop(begin, end, count, masks); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + label_t* labels; + if (type == "train" && subgraph_sample_size) { + labels = context->get_labels_subg_ptr(); + } else { + labels = context->get_labels_ptr(); + } + if (is_single_class) { + acc = masked_accuracy(begin, end, count, masks, predictions, labels); + } else { + acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); + } + t_eval.Stop(); + return t_eval.Millisecs(); + } 
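With this change the Net constructor, train() and evaluate() live entirely in the header, so an application only instantiates Net and calls into it. The sketch below shows roughly how such a driver looks, mirroring the lonestar/gnn/gcn/gcn.cpp change later in this patch; it assumes the lonestargnn command-line globals (dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval) are in scope and that opt points to some concrete deepgalois::optimizer; it is an illustrative sketch, not code added by this patch.

    // Minimal driver sketch (assumes lonestargnn CLI globals and a concrete optimizer).
    void run_gnn(deepgalois::optimizer* opt) {
      // construct the network from the CLI configuration (same call as in gcn.cpp)
      deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs,
                              hidden1, learning_rate, dropout_rate, weight_decay,
                              add_selfloop, is_single_class, add_l2norm, add_dense,
                              neighbor_sample_sz, subgraph_sample_sz, val_interval);
      network.construct_layers();            // conv layers, optional l2norm/dense, output layer
      network.train(opt, true);              // per epoch: fprop, bprop, weight update, validation
      network.read_test_masks(dataset);      // load the "test" split masks
      acc_t test_loss = 0.0, test_acc = 0.0; // acc_t comes from deepgalois/types.h
      network.evaluate("test", test_loss, test_acc);
    }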
+ + // read masks of test set + void read_test_masks(std::string dataset) { + test_masks = new mask_t[num_samples]; + if (dataset == "reddit") { + test_begin = 177262; + test_count = 55703; + test_end = test_begin + test_count; +#ifndef GALOIS_USE_DIST + for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; +#else + for (size_t i = test_begin; i < test_end; i++) { + if (dGraph->isLocal(i)) { + test_masks[dGraph->getLID(i)] = 1; + } + } +#endif + } else { +#ifndef GALOIS_USE_DIST + test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks); +#else + test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); +#endif + } +#ifndef CPU_ONLY + copy_test_masks_to_device(); +#endif + } + void copy_test_masks_to_device(); + + void construct_layers() { + // append conv layers + std::cout << "\nConstructing layers...\n"; + for (size_t i = 0; i < num_conv_layers-1; i++) + append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + if (has_l2norm) + append_l2norm_layer(num_conv_layers); // l2_norm layer + if (has_dense) + append_dense_layer(num_layers-2); // dense layer + append_out_layer(num_layers-1); // output layer + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + layers[i]->add_edge(); + } + for (size_t i = 1; i < num_layers; i++) + connect(layers[i - 1], layers[i]); + for (size_t i = 0; i < num_layers; i++) + layers[i]->malloc_and_init(); + layers[0]->set_in_data(context->get_feats_ptr()); // feed input data + // precompute the normalization constant based on graph structure + context->norm_factor_computing(0); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + set_contexts(); + } + + //! Add an l2_norm layer to the network + void append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); + } + + //! Add an dense layer to the network + void append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); + } + + //! Add an output layer to the network + void append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); + } + + //! 
Add a convolution layer to the network + void append_conv_layer(size_t layer_id, bool act=false, bool norm=true, bool bias=false, bool dropout=true) { + assert(dropout_rate < 1.0); + assert(layer_id < num_conv_layers); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); + layers[layer_id]->set_graph_ptr(context->getGraphPointer()); + } + + // update trainable weights after back-propagation + void update_weights(optimizer* opt) { + normalize(); + regularize(); + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } + } + + //! forward propagation: [begin, end) is the range of samples used. + //! calls "forward" on each layer and returns the loss of the final layer + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) { + layers[i]->forward(); + // TODO need to sync model between layers here + } + // prediction error + auto loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + return loss; + } + + void bprop() { + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } + } //! Save the context object to all layers of the network void set_contexts() { @@ -104,6 +483,7 @@ class Net { int val_interval; int num_subgraphs; int num_vertices_sg; + bool is_selfloop; mask_t* train_masks; // masks for training mask_t* d_train_masks; // masks for training on device @@ -133,4 +513,3 @@ class Net { } // namespace deepgalois -#endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 719b5267ee..1f63eacc60 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -4,105 +4,10 @@ #include "galois/Timer.h" #include "deepgalois/net.h" -#include "deepgalois/utils.h" #include "deepgalois/math_functions.hh" namespace deepgalois { -void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv) { - assert(n_conv > 0); - num_threads = nt; - num_conv_layers = n_conv; - num_epochs = epochs; - learning_rate = lr; - dropout_rate = dropout; - weight_decay = wd; - is_single_class = single; - has_l2norm = l2norm; - has_dense = dense; - neighbor_sample_size = neigh_sz; - subgraph_sample_size = subg_sz; - val_interval = val_itv; - //num_subgraphs = 1;//num_threads; - galois::gPrint("Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, - ", num_epochs ", num_epochs, - ", hidden1 ", hidden1, - ", learning_rate ", learning_rate, - ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); -#ifndef GALOIS_USE_DIST - context = new deepgalois::Context(); - context->set_dataset(dataset_str); - num_samples = context->read_graph(selfloop); - context->set_label_class(is_single_class); -#else - // only done here to avoid unused var complain TODO find better way - (void)selfloop; -#endif - - // 
read graph, get num nodes - num_classes = context->read_labels(); - -#ifndef GALOIS_USE_DIST - //std::cout << "Reading label masks ... "; - train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks+num_samples, 0); - std::fill(val_masks, val_masks+num_samples, 0); - - // get training and validation sets - if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - // TODO do all can be used below - for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; - } else { - train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); - val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); - } -#endif - - if (subgraph_sample_size > train_count) { - galois::gPrint("FATAL: subgraph size can not be larger than the size of training set\n"); - exit(1); - } - // NOTE: train_begin/train_end are global IDs, train_masks is a local id - // train count and val count are LOCAL counts - - num_layers = num_conv_layers + 1; - if (has_l2norm) num_layers ++; - if (has_dense) num_layers ++; - // initialize feature metadata - feature_dims.resize(num_layers + 1); - feature_dims[0] = context->read_features(); // input feature dimension: D - for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers-1] = num_classes; // MLP embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E - layers.resize(num_layers); - -#ifndef GALOIS_USE_DIST - context->set_use_subgraph(subgraph_sample_size > 0); -#ifdef CPU_ONLY - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); -#else - copy_masks_device(num_samples, train_masks, d_train_masks); - copy_masks_device(num_samples, val_masks, d_val_masks); - context->copy_data_to_device(); // copy labels and input features to the device -#endif -#endif -} - #ifdef GALOIS_USE_DIST void Net::dist_init(Graph* graph, std::string dataset_str) { dGraph = graph; @@ -111,6 +16,7 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { context->saveGraph(dGraph); // TODO self loop setup? context->initializeSyncSubstrate(); + num_classes = context->read_labels(); //std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; @@ -137,367 +43,38 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks, dGraph); val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks, dGraph); } -} -#endif - -void Net::train(optimizer* opt, bool need_validate) { - std::string header = ""; - std::string seperator = " "; -#ifdef GALOIS_USE_DIST - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - header = "[" + std::to_string(myID) + "] "; - seperator = "\n"; -#endif - - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - double total_train_time = 0.0; - int num_subg_remain = 0; -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST - if (subgraph_sample_size) { - context->createSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[num_samples*num_subgraphs]; - galois::gPrint("\nConstruct training vertex set induced graph...\n"); - sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); - } -#endif -#endif - galois::gPrint("\nStart training...\n"); - Timer t_epoch; - // run epochs - for (int ep = 0; ep < num_epochs; ep++) { - t_epoch.Start(); - - if (subgraph_sample_size) { - if (num_subg_remain == 0) { - galois::gPrint("Generating ", num_subgraphs, " subgraphs "); - Timer t_subgen; - t_subgen.Start(); - // generate subgraphs -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST - for (int sid = 0; sid < num_subgraphs; sid++) { - //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { - unsigned tid = 0; - //tid = galois::substrate::ThreadPool::getTID(); - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); - }//, galois::loopname("subgraph_gen")); -#endif -#endif - num_subg_remain = num_subgraphs; - t_subgen.Stop(); - //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); - } -#ifndef GALOIS_USE_DIST - for (int i = 0; i < num_subgraphs; i++) { - auto sg_ptr = context->getSubgraphPointer(i); - sg_ptr->degree_counting(); - //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); - } -#endif //GALOIS_USE_DIST - num_subg_remain--; - int sg_id = num_subg_remain; - auto subgraph_ptr = context->getSubgraphPointer(sg_id); - num_vertices_sg = subgraph_ptr->size(); - //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); - for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(num_vertices_sg); - context->norm_factor_computing(1, sg_id); - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(subgraph_ptr); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); - } - // update labels for subgraph - context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); - - // update features for subgraph - context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data - } - - // training steps - galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); - set_netphases(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - - // forward: after this phase, layer 
edges will contain intermediate features - // for use during backprop - Tfw.start(); - double fw_time = evaluate("train", train_loss, train_acc); - Tfw.stop(); - - // backward: use intermediate features + ground truth to update layers - // with feature gradients whcih are then used to calculate weight gradients - Tbw.start(); - Net::bprop(); - Tbw.stop(); - - // gradient update: use gradients stored on each layer to update model for - // next epoch - Tupdate.start(); - Net::update_weights(opt); // update parameters - Tupdate.stop(); - - // validation / testing - set_netphases(net_phase::test); - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, train_loss, - " train_acc ", train_acc, seperator); - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - total_train_time += epoch_time; - if (need_validate && ep % val_interval == 0) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate("val", val_loss, val_acc); - Tval.stop(); - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, - " val_acc ", val_acc, seperator); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, - " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); - } else { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); - } - } - double avg_train_time = total_train_time / (double)num_epochs; - double throughput = 1000.0 * (double)num_epochs / total_train_time; - galois::gPrint("\nAverage training time: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + feature_dims[0] = context->read_features(); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers-1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E + layers.resize(num_layers); } - -// evaluate, i.e. 
inference or predict -double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { - // TODO may need to do something for the dist case - Timer t_eval; - t_eval.Start(); - size_t begin = 0, end = 0, count = 0; - mask_t* masks = NULL; - if (type == "train") { - begin = train_begin; - end = train_end; - count = train_count; - masks = train_masks; - if (subgraph_sample_size) { - // update masks for subgraph - masks = NULL; - begin = 0; - end = num_vertices_sg; - count = num_vertices_sg; - } - } else if (type == "val") { - begin = val_begin; - end = val_end; - count = val_count; - masks = val_masks; - } else { - begin = test_begin; - end = test_end; - count = test_count; - masks = test_masks; - } -#ifdef CPU_ONLY - if (subgraph_sample_size && type != "train") { // switch to the original graph - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getGraphPointer()); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); - } - layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); - layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data - } -#else - if (type == "train") { - masks = d_train_masks; - } else if (type == "val") { - masks = d_val_masks; - } else { - masks = d_test_masks; - } #endif - loss = fprop(begin, end, count, masks); - float_t* predictions = layers[num_layers - 1]->next()->get_data(); - label_t* labels; - if (type == "train" && subgraph_sample_size) { - labels = context->get_labels_subg_ptr(); - } else { - labels = context->get_labels_ptr(); - } - if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, predictions, labels); - } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); - } - t_eval.Stop(); - return t_eval.Millisecs(); -} - -//! forward propagation: [begin, end) is the range of samples used. -//! calls "forward" on the layers of the network and returns the loss of the -//! 
final layer -acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) { - layers[i]->forward(); - // TODO need to sync model between layers here - } - // prediction error - auto loss = layers[num_layers - 1]->get_prediction_loss(); - // Squared Norm Regularization to mitigate overfitting - loss += weight_decay * layers[0]->get_weight_decay_loss(); - return loss; -} - -void Net::bprop() { - for (size_t i = num_layers; i != 0; i--) { - layers[i - 1]->backward(); - } -} - -// Scale gradient to counterbalance accumulation -void Net::normalize() { +#ifdef CPU_ONLY +void Net::init() { + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); } // add weight decay void Net::regularize() { size_t layer_id = 0; auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; -#ifdef CPU_ONLY // TODO: parallel math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), layers[layer_id]->get_grads_ptr()); -#else - axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), - layers[layer_id]->get_grads_device_ptr()); -#endif -} - -void Net::update_weights(optimizer* opt) { - normalize(); - regularize(); - for (size_t i = 0; i < num_layers; i++) { - if (layers[i]->trainable()) { - layers[i]->update_weight(opt); - } - } -} - -void Net::construct_layers() { - // append conv layers - std::cout << "\nConstructing layers...\n"; - for (size_t i = 0; i < num_conv_layers-1; i++) - append_conv_layer(i, true); // conv layers, act=true - append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false - if (has_l2norm) - append_l2norm_layer(num_conv_layers); // l2_norm layer - if (has_dense) - append_dense_layer(num_layers-2); // dense layer - append_out_layer(num_layers-1); // output layer - - // allocate memory for intermediate features and gradients - for (size_t i = 0; i < num_layers; i++) { - layers[i]->add_edge(); - } - for (size_t i = 1; i < num_layers; i++) - connect(layers[i - 1], layers[i]); - for (size_t i = 0; i < num_layers; i++) - layers[i]->malloc_and_init(); - layers[0]->set_in_data(context->get_feats_ptr()); // feed input data - // precompute the normalization constant based on graph structure - context->norm_factor_computing(0); - for (size_t i = 0; i < num_conv_layers; i++) - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); - set_contexts(); -} - -//! Add an l2_norm layer to the network -void Net::append_l2norm_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); -} - -//! Add an dense layer to the network -void Net::append_dense_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); -} - -//! 
Add an output layer to the network -void Net::append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - if (is_single_class) - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - else - layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); } -//! Add a convolution layer to the network -void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, - bool dropout) { - assert(dropout_rate < 1.0); - assert(layer_id < num_conv_layers); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); - layers[layer_id]->set_graph_ptr(context->getGraphPointer()); -} - -void Net::read_test_masks(std::string dataset) { - test_masks = new mask_t[num_samples]; - if (dataset == "reddit") { - test_begin = 177262; - test_count = 55703; - test_end = test_begin + test_count; -#ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; -#else - for (size_t i = test_begin; i < test_end; i++) { - if (dGraph->isLocal(i)) { - test_masks[dGraph->getLID(i)] = 1; - } - } -#endif - } else { -#ifndef GALOIS_USE_DIST - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks); -#else - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); -#endif - } -#ifndef CPU_ONLY - copy_masks_device(num_samples, test_masks, d_test_masks); -#endif +// Scale gradient to counterbalance accumulation +void Net::normalize() { } -#ifdef CPU_ONLY /** * * @param begin GLOBAL begin diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 115ff6d81d..1a50c0c551 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -143,6 +143,25 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, } namespace deepgalois { + +void Net::init() { + copy_masks_device(num_samples, train_masks, d_train_masks); + copy_masks_device(num_samples, val_masks, d_val_masks); + context->copy_data_to_device(); // copy labels and input features to the device +} + +void Net::copy_test_masks_to_device() { + copy_masks_device(num_samples, test_masks, d_test_masks); +} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); +} + acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); diff --git a/libgpu/include/checker.h b/libgpu/include/checker.h new file mode 100644 index 0000000000..7f2cf4e36e --- /dev/null +++ b/libgpu/include/checker.h @@ -0,0 +1,15 @@ +#ifndef CHECKER_H +#define CHECKER_H +#include +#include + +static void check_cuda_error(const cudaError_t e, const char* file, + const int line) { + if (e != cudaSuccess) { + fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); + exit(1); + } +} +#define check_cuda(x) check_cuda_error(x, 
__FILE__, __LINE__) + +#endif diff --git a/libgpu/include/csr_graph.h b/libgpu/include/csr_graph.h index c663edb8a3..7fff0750e4 100644 --- a/libgpu/include/csr_graph.h +++ b/libgpu/include/csr_graph.h @@ -18,7 +18,10 @@ //#include "graph_gpu.h" #include +#include "checker.h" +#include "graph_gpu.h" +/* // Adapted from LSG CSRGraph.h // TODO: make this template data @@ -143,6 +146,7 @@ struct CSRGraph { bool device_graph; }; >>>>>>> dist-dev +//*/ struct CSRGraphTex : CSRGraph { cudaTextureObject_t edge_dst_tx; diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index eff742aa69..589f60b881 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,13 +1,12 @@ -#app(gcn gcn.cpp) add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) +if(ENABLE_HETERO_GALOIS) + target_link_libraries(gcn PRIVATE dg_gpu) + target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) +else() target_link_libraries(gcn PRIVATE dg_cpu) if(ENABLE_DIST_GALOIS) target_link_libraries(gcn PRIVATE distgraphloader) endif() - -if(ENABLE_HETERO_GALOIS) - target_link_libraries(gcn PRIVATE dg_gpu) - target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) endif() diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index de3f2a76ee..a8ab651603 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -16,7 +16,11 @@ int main(int argc, char** argv) { galois::DistMemSys G; #endif LonestarGnnStart(argc, argv, name, desc, url); - deepgalois::Net network; // the neural network to train + // the neural network to train + deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, + hidden1, learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, val_interval); #ifdef GALOIS_USE_DIST std::vector dummyVec; @@ -25,10 +29,6 @@ int main(int argc, char** argv) { #endif // read network, features, ground truth, initialize metadata - network.init(dataset, numThreads, num_conv_layers, epochs, hidden1, - learning_rate, dropout_rate, weight_decay, - add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, val_interval); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); From d9aae5d50e8359861bd671b81db59900e468d090 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 10:04:37 -0500 Subject: [PATCH 250/660] fix gtypes --- libdeepgalois/include/deepgalois/context.h | 12 ++--- libdeepgalois/include/deepgalois/gtypes.h | 48 +++++++++++-------- .../include/deepgalois/layers/aggregator.h | 8 ++-- .../include/deepgalois/layers/layer.h | 8 ++-- libdeepgalois/include/deepgalois/lgraph.h | 11 +++-- libdeepgalois/include/deepgalois/sampler.h | 2 +- libdeepgalois/src/context.cpp | 9 +--- libdeepgalois/src/layers/aggregator.cpp | 4 +- libdeepgalois/src/layers/l2_norm_layer.cpp | 1 + .../src/layers/sigmoid_loss_layer.cpp | 5 +- .../src/layers/softmax_loss_layer.cpp | 5 +- libdeepgalois/src/net.cpp | 5 +- libdeepgalois/src/node.cu | 2 +- libdeepgalois/src/sampler.cpp | 1 + 14 files changed, 66 insertions(+), 55 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index f9ca056421..afabe49973 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -10,7 +10,7 @@ #ifdef CPU_ONLY #include 
"deepgalois/gtypes.h" #else -#include "graph_gpu.h" +//#include "graph_gpu.h" #include "deepgalois/cutils.h" #endif @@ -46,22 +46,22 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t *masks); void createSubgraphs(int num_subgraphs); -#ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; +#ifdef CPU_ONLY float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else - CSRGraph graph_gpu; // the input graph, |V| = N - std::vector subgraphs_gpu; - CSRGraph* getGraphPointer() { return &graph_gpu; } - CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; + //CSRGraph graph_gpu; // the input graph, |V| = N + //std::vector subgraphs_gpu; + //CSRGraph* getGraphPointer() { return &graph_gpu; } + //CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 697d386d9a..cc6fba8041 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,37 +1,47 @@ #pragma once -#include "galois/Galois.h" -#include "galois/graphs/LCGraph.h" #include "deepgalois/types.h" -#include "deepgalois/lgraph.h" #ifdef GALOIS_USE_DIST +#include "galois/Galois.h" #include "galois/graphs/NewGeneric.h" +#else +#ifdef CPU_ONLY +//#include "galois/Galois.h" +//#include "galois/graphs/LCGraph.h" +#include "deepgalois/lgraph.h" +#else +#include "graph_gpu.h" #endif - -namespace deepgalois { - -typedef galois::GAccumulator AccumF; -typedef galois::GAccumulator AccumU; -#ifdef GALOIS_USE_DIST -using AccuracyAccum = galois::DGAccumulator; #endif #ifndef GALOIS_USE_DIST -#ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< - true>::type ::with_no_lockable::type LCGraph; -#else -typedef galois::graphs::LC_CSR_Graph:: - with_numa_alloc::type ::with_no_lockable::type LCGraph; -#endif + +namespace deepgalois { +#ifdef CPU_ONLY +//#ifdef EDGE_LABEL +//typedef galois::graphs::LC_CSR_Graph:: +// with_numa_alloc::type ::with_no_lockable::type LCGraph; +//#else +//typedef galois::graphs::LC_CSR_Graph:: +// with_numa_alloc::type ::with_no_lockable::type LCGraph; +//#endif //typedef LCGraph Graph; //typedef Graph::edge_iterator edge_iterator; -typedef LearningGraph Graph; typedef index_t edge_iterator; +typedef LearningGraph Graph; +#else +//typedef CSRGraph GraphGPU; +typedef LearningGraph GraphGPU; +#endif +} + #else + +namespace deepgalois { // TODO check if this needs changing typedef index_t edge_iterator; using Graph = galois::graphs::DistGraph; +} + #endif -} diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 1b2d4b5104..67d4bedf3f 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -1,9 +1,9 @@ #pragma once -#include "deepgalois/types.h" +//#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" //! 
For each node in the graph, add the embeddings of all of its neighbors //! together (using norm_factor if specified) #ifdef CPU_ONLY -#include "deepgalois/gtypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); @@ -13,9 +13,9 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, #else #include "graph_gpu.h" namespace deepgalois { -void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); -void update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } #endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 206e5e7da3..5d4d1419d2 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -75,11 +75,11 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -#ifdef CPU_ONLY +//#ifdef CPU_ONLY void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } -#else - void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } -#endif +//#else +// void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } +//#endif void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } //! set the data of the previous layer connected to this one diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index f8e5ce8315..a82a80c989 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -38,11 +38,7 @@ class LearningGraph { void readGraphFromGRFile(const std::string& filename); size_t size() { return (size_t)num_vertices_; } size_t sizeEdges() { return (size_t)num_edges_; } - index_t getDegree(index_t vid) { return degrees_[vid]; } - index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t get_degree(index_t vid) { return degrees_[vid]; } - index_t edge_begin(index_t vid) { return rowptr_[vid]; } - index_t edge_end(index_t vid) { return rowptr_[vid+1]; } iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } @@ -66,6 +62,11 @@ class LearningGraph { uint64_t globalSize(); #ifdef CPU_ONLY + index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t edge_begin(index_t vid) { return rowptr_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid+1]; } + vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + index_t getDegree(index_t vid) { return degrees_[vid]; } index_t* row_start_ptr() { return &rowptr_[0]; } const index_t* row_start_ptr() const { return &rowptr_[0]; } index_t* edge_dst_ptr() { return &colidx_[0]; } @@ -77,7 +78,7 @@ class LearningGraph { __device__ index_t getEdgeDst(unsigned edge) { return colidx_[edge]; } __device__ index_t edge_begin(unsigned src) { return d_rowptr_[src]; } __device__ index_t edge_end(unsigned src) { return d_rowptr_[src+1]; } - __device__ vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + __device__ vdata_t getData(unsigned vid) { return d_vertex_data_[vid]; } __device__ index_t getDegree(unsigned vid) { return d_degrees_[vid]; } index_t *row_start_ptr() { 
return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index eb3b936d18..c5f8abd219 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -22,7 +22,7 @@ class Sampler { virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); virtual void select_vertices(size_t n, int m, VertexSet &vertex_set, unsigned tid); - galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); + //galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); edge_iterator sampled_edge_begin(Graph &g, VertexID v) { return g.edge_begin(v); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 71410eee13..bfa006a1d7 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -4,12 +4,10 @@ #include "deepgalois/context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" -//#include +#include "galois/Galois.h" namespace deepgalois { -#ifdef CPU_ONLY - Context::Context() : Context(false) {} Context::~Context() { @@ -247,10 +245,6 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self } } -#endif - - - /* inline void init_features(size_t dim, vec_t &x) { std::default_random_engine rng; @@ -259,4 +253,5 @@ inline void init_features(size_t dim, vec_t &x) { x[i] = dist(rng); } */ + } // end deepgalois namespace diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index d17cf79a72..8b9e726e8e 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -1,7 +1,7 @@ #include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" -#ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { //std::cout << "[update_all] graph size: " << n << "\n"; @@ -50,4 +50,4 @@ void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, out); Tcsrmm.stop(); } -#endif + diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index a5a77eb82e..f1cb6a4445 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/layers/l2_norm_layer.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" namespace deepgalois { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 5a511d2308..d7ec46378e 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" namespace deepgalois { @@ -63,8 +64,8 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* acc_t sigmoid_loss_layer::get_prediction_loss() { assert(count_ > 0); - AccumF total_loss; - AccumU valid_sample_count; + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { 
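
As a point of reference for the change above: the loss layers now use galois::GAccumulator directly instead of the old AccumF/AccumU typedefs from gtypes.h. A minimal, self-contained sketch of that masked-reduction pattern (the mask and per-sample loss arrays here are placeholders, not the real layer state) might look like:

// Hedged sketch: illustrates only the GAccumulator + do_all reduction pattern;
// `masks` and `losses` are made-up inputs, not fields of the actual layer.
#include "galois/Galois.h"
#include "galois/Reduction.h"
#include <cstddef>
#include <cstdint>

double average_masked_loss(size_t n, const uint8_t* masks, const float* losses) {
  galois::GAccumulator<double> total_loss;
  galois::GAccumulator<size_t> valid_sample_count;
  total_loss.reset();
  valid_sample_count.reset();
  galois::do_all(galois::iterate(size_t(0), n), [&](size_t i) {
    if (masks[i] == 1) {        // only count samples selected by the mask
      total_loss += losses[i];
      valid_sample_count += 1;
    }
  }, galois::loopname("masked_loss_reduce"));
  size_t cnt = valid_sample_count.reduce();
  return cnt ? total_loss.reduce() / static_cast<double>(cnt) : 0.0;
}

// Usage note: a galois::SharedMemSys (or DistMemSys) instance must be alive
// before galois::do_all is invoked, as in the gcn.cpp driver.
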
diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 2fc7ac80dc..d40ff6d411 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" namespace deepgalois { @@ -66,8 +67,8 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* acc_t softmax_loss_layer::get_prediction_loss() { assert(count_ > 0); - AccumF total_loss; - AccumU valid_sample_count; + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 1f63eacc60..381539df6b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -3,6 +3,7 @@ */ #include "galois/Timer.h" +#include "galois/Galois.h" #include "deepgalois/net.h" #include "deepgalois/math_functions.hh" @@ -83,9 +84,9 @@ void Net::normalize() { */ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { #ifndef GALOIS_USE_DIST - AccumF accuracy_all; + galois::GAccumulator accuracy_all; #else - AccuracyAccum accuracy_all; + galois::DGAccumulator accuracy_all; galois::DGAccumulator sampleCount; sampleCount.reset(); #endif diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index b5a17af1fd..afaceaeaea 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -9,7 +9,7 @@ void edge::alloc() { CUDA_CHECK(cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } -void edge::merge_grads_gpu(float_t* dst) { +void edge::merge_grads(float_t* dst) { CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); } diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 47317bdd3d..f61f1bcaa4 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -1,5 +1,6 @@ #include "deepgalois/utils.h" #include "deepgalois/sampler.h" +#include "galois/Galois.h" #include #include #define PARALLEL_GEN From c3001cc20ddf33435611bd419042ebdc2372d0b4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 10:57:41 -0500 Subject: [PATCH 251/660] fix GraphGPU type --- libdeepgalois/include/deepgalois/context.h | 15 +++++------ libdeepgalois/include/deepgalois/gtypes.h | 5 ++-- .../include/deepgalois/layers/aggregator.h | 7 +++--- .../deepgalois/layers/graph_conv_layer.h | 4 +-- .../include/deepgalois/layers/layer.h | 12 ++++----- libdeepgalois/include/deepgalois/lgraph.h | 15 ++++++----- libdeepgalois/src/context.cu | 25 +++++++++++-------- libdeepgalois/src/layers/aggregator.cu | 18 +++++++------ libdeepgalois/src/layers/graph_conv_layer.cu | 4 +-- lonestar/gnn/gcn/CMakeLists.txt | 5 ++++ 10 files changed, 62 insertions(+), 48 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index afabe49973..f8b848f453 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,10 +7,11 @@ #include #include "deepgalois/types.h" #include "deepgalois/reader.h" -#ifdef CPU_ONLY +//#ifdef CPU_ONLY #include "deepgalois/gtypes.h" -#else +//#else //#include "graph_gpu.h" +#ifndef CPU_ONLY #include 
"deepgalois/cutils.h" #endif @@ -46,22 +47,22 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t *masks); void createSubgraphs(int num_subgraphs); +#ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; -#ifdef CPU_ONLY float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else - //CSRGraph graph_gpu; // the input graph, |V| = N - //std::vector subgraphs_gpu; - //CSRGraph* getGraphPointer() { return &graph_gpu; } - //CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; + GraphGPU graph_gpu; // the input graph, |V| = N + std::vector subgraphs_gpu; + GraphGPU* getGraphPointer() { return &graph_gpu; } + GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index cc6fba8041..d12ac8e0d1 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -10,13 +10,15 @@ //#include "galois/graphs/LCGraph.h" #include "deepgalois/lgraph.h" #else -#include "graph_gpu.h" +//#include "graph_gpu.h" +#include "deepgalois/lgraph.h" #endif #endif #ifndef GALOIS_USE_DIST namespace deepgalois { +typedef index_t edge_iterator; #ifdef CPU_ONLY //#ifdef EDGE_LABEL //typedef galois::graphs::LC_CSR_Graph:: @@ -27,7 +29,6 @@ namespace deepgalois { //#endif //typedef LCGraph Graph; //typedef Graph::edge_iterator edge_iterator; -typedef index_t edge_iterator; typedef LearningGraph Graph; #else //typedef CSRGraph GraphGPU; diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 67d4bedf3f..90c5781189 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -1,9 +1,9 @@ #pragma once -//#include "deepgalois/types.h" -#include "deepgalois/gtypes.h" +#include "deepgalois/types.h" //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) #ifdef CPU_ONLY +#include "deepgalois/gtypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); @@ -11,7 +11,8 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); } #else -#include "graph_gpu.h" +#include "deepgalois/gtypes.h" +//#include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 92bc999653..56c0de0be6 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -56,8 +56,8 @@ class graph_conv_layer : public layer { virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); - void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); + virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 5d4d1419d2..cebef58059 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,7 +9,7 @@ * Reused/revised under 3-BSD */ -#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else @@ -75,11 +75,11 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -//#ifdef CPU_ONLY +#ifdef CPU_ONLY void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } -//#else -// void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } -//#endif +#else + void set_graph_ptr(GraphGPU *ptr) { graph_gpu = ptr; } +#endif void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } //! 
set the data of the previous layer connected to this one @@ -173,7 +173,7 @@ class layer : public deepgalois::node { #ifdef CPU_ONLY Graph *graph_cpu; #else - CSRGraph *graph_gpu; + GraphGPU *graph_gpu; #endif #ifdef GALOIS_USE_DIST diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index a82a80c989..77d48d87a6 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -1,7 +1,6 @@ #pragma once #include "deepgalois/types.h" #include -//#include namespace deepgalois { @@ -27,7 +26,6 @@ class LearningGraph { public: typedef size_t iterator; - //using iterator = boost::counting_iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), //rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} @@ -65,7 +63,7 @@ class LearningGraph { index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + vdata_t getData(index_t vid) { return vertex_data_[vid]; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t* row_start_ptr() { return &rowptr_[0]; } const index_t* row_start_ptr() const { return &rowptr_[0]; } @@ -75,11 +73,12 @@ class LearningGraph { edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } #else - __device__ index_t getEdgeDst(unsigned edge) { return colidx_[edge]; } - __device__ index_t edge_begin(unsigned src) { return d_rowptr_[src]; } - __device__ index_t edge_end(unsigned src) { return d_rowptr_[src+1]; } - __device__ vdata_t getData(unsigned vid) { return d_vertex_data_[vid]; } - __device__ index_t getDegree(unsigned vid) { return d_degrees_[vid]; } + __device__ index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + __device__ index_t edge_begin(index_t src) { return d_rowptr_[src]; } + __device__ index_t edge_end(index_t src) { return d_rowptr_[src+1]; } + __device__ vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + __device__ index_t getDegree(index_t vid) { return d_degrees_[vid]; } + __device__ index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } index_t *row_start_ptr() { return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } index_t *edge_dst_ptr() { return d_colidx_; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 7f435e8ca8..528c34b7e5 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -24,8 +24,10 @@ int64_t cluster_seedgen(void) { return seed; } +namespace deepgalois { + // computing normalization factor for each vertex -__global__ void norm_factor_computing_node(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); if (temp == 0.0) norm_fac[i] = 0.0; @@ -35,16 +37,16 @@ __global__ void norm_factor_computing_node(int n, CSRGraph graph, float_t* norm_ // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_computing_edge(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); 
assert(d_src != 0.0); // should never be zero since self-loop added for each vertex d_src = 1.0 / sqrt(d_src); - index_type start = graph.edge_begin(src); - index_type end = graph.edge_end(src); - for (index_type e = start; e != end; e++) { - index_type dst = graph.getEdgeDst(e); + auto start = graph.edge_begin(src); + index_t end = graph.edge_end(src); + for (index_t e = start; e != end; e++) { + index_t dst = graph.getEdgeDst(e); if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); @@ -55,8 +57,6 @@ __global__ void norm_factor_computing_edge(int n, CSRGraph graph, float_t* norm_ } } -namespace deepgalois { - cublasHandle_t Context::cublas_handle_ = 0; cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; @@ -102,7 +102,7 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { exit(0); } #ifdef USE_CUSPARSE - int nnz = graph_gpu.nedges; + int nnz = graph_gpu.sizeEdges(); CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); init_const_gpu(nnz, 0.0, norm_factors); norm_factor_computing_edge<<>>(n, graph_gpu, norm_factors); @@ -128,14 +128,17 @@ void Context::SetDevice(const int device_id) { */ size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; - CSRGraph g; - g.read(filename.c_str(), false); + /*GraphGPU g; + graph.read(filename.c_str(), false); if (selfloop) { g.add_selfloop(); is_selfloop_added = selfloop; } g.copy_to_gpu(graph_gpu); n = graph_gpu.nnodes; + */ + graph_gpu.readGraphFromGRFile(filename); + graph_gpu.copy_to_gpu(); return n; } diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 1f739eef58..fe3aca0182 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -5,6 +5,8 @@ #include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" +namespace deepgalois { + // TODO: use warp __device__ void scale_add(const int n, const float_t alpha, const float_t* a, const float_t* b, float_t* y) { @@ -12,7 +14,7 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_naive(size_t n, size_t len, CSRGraph g, +__global__ void update_all_naive(size_t n, size_t len, GraphGPU g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { @@ -29,7 +31,7 @@ __global__ void update_all_naive(size_t n, size_t len, CSRGraph g, } } -__global__ void update_all_warp(size_t n, size_t len, CSRGraph g, +__global__ void update_all_warp(size_t n, size_t len, GraphGPU g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { __shared__ index_type ptrs[BLOCK_SIZE/WARP_SIZE][2]; @@ -59,23 +61,25 @@ __global__ void update_all_warp(size_t n, size_t len, CSRGraph g, } } -void deepgalois::update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - unsigned n = g.nnodes; + unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } -void deepgalois::update_all_csrmm(size_t len, CSRGraph& g, const 
float_t* in, float_t* out, +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - unsigned n = g.nnodes; + unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; //print_device_vector(10, norm_factor, "norm_factor"); float *temp; float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); + csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); float_free_device(temp); } + +} diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index f4282ced42..e814631022 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -30,7 +30,7 @@ void graph_conv_layer::malloc_and_init() { init_const_gpu(y*z, 0.0, layer::d_weight_grad); } -void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { +void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else @@ -38,7 +38,7 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo #endif } -void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { +void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index 589f60b881..80daca4c78 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,7 +1,12 @@ +if(ENABLE_HETERO_GALOIS) + set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) +endif() add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) if(ENABLE_HETERO_GALOIS) + set_property(TARGET gcn PROPERTY CUDA_STANDARD 14) + set_property(TARGET gcn PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(gcn PRIVATE dg_gpu) target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() From ca77dc4675706eac1ca156a8e3041ff72422115c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 21:45:22 -0500 Subject: [PATCH 252/660] added a define to disable boost 1.69 warnings --- libgalois/include/galois/runtime/Statistics.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libgalois/include/galois/runtime/Statistics.h b/libgalois/include/galois/runtime/Statistics.h index 253bfcad01..5c6df094cb 100644 --- a/libgalois/include/galois/runtime/Statistics.h +++ b/libgalois/include/galois/runtime/Statistics.h @@ -28,6 +28,8 @@ #include #include +// added her to get rid of annoying int log deprecation in boost 1.69 +#define BOOST_ALLOW_DEPRECATED_HEADERS #include // uuid class #include // generators #include // streaming operators etc. 
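
The warning fix above relies on BOOST_ALLOW_DEPRECATED_HEADERS being defined before the first Boost header is pulled in. A stripped-down, standalone illustration of that ordering (assuming the includes behind the "uuid class" / "generators" / "streaming operators" comments are the boost::uuid headers; this is not the actual contents of Statistics.h):

// Hedged sketch: shows only the define-before-include ordering requirement.
#define BOOST_ALLOW_DEPRECATED_HEADERS    // silence deprecated-header warnings (e.g. integer_log2.hpp) in Boost 1.69
#include <boost/uuid/uuid.hpp>            // uuid class
#include <boost/uuid/uuid_generators.hpp> // generators
#include <boost/uuid/uuid_io.hpp>         // streaming operators

int main() {
  boost::uuids::uuid id = boost::uuids::random_generator()();
  return id.is_nil() ? 1 : 0;
}
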
From 532d41fac386a851e7e62f50385a7eca8e9b19c9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 May 2020 11:41:08 -0500 Subject: [PATCH 253/660] cmake changes (indentation fixes) --- libdeepgalois/CMakeLists.txt | 84 ++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 23a0b44ed7..d57962c185 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -70,48 +70,50 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + if(NOT ENABLE_HETERO_GALOIS) -if(ENABLE_DIST_GALOIS) -# do not link regular context.cpp; TODO do this conditional in cleaner way -# also don't link sampler -set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/layers/layer.cpp - src/DistContext.cpp - src/optimizer.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp -) -else() -set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/layers/layer.cpp - src/optimizer.cpp - src/context.cpp - src/sampler.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp -) + if(ENABLE_DIST_GALOIS) + # do not link regular context.cpp; TODO do this conditional in cleaner way + # also don't link sampler + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/layers/layer.cpp + src/DistContext.cpp + src/optimizer.cpp + src/reader.cpp + src/lgraph.cpp + src/utils.cpp + src/node.cpp + src/net.cpp + ) + else() + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/layers/layer.cpp + src/optimizer.cpp + src/context.cpp + src/sampler.cpp + src/reader.cpp + src/lgraph.cpp + src/utils.cpp + src/node.cpp + src/net.cpp + ) + endif() endif() add_library(dg_cpu STATIC ${sources}) From 2c90c4934731b0240278f45a5b317c0bc07fe7c8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 May 2020 11:50:26 -0500 Subject: [PATCH 254/660] deepgalois cmake if closure fix --- libdeepgalois/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index d57962c185..5a732ccf93 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -113,8 +113,8 @@ if(NOT ENABLE_HETERO_GALOIS) src/node.cpp src/net.cpp ) - endif() -endif() + endif(ENABLE_DIST_GALOIS) +endif(NOT ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) @@ -148,4 +148,3 @@ set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) -endif() From 390ab47f90f003fb66853ff1027e63ca361b0181 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 
12:27:28 -0500 Subject: [PATCH 255/660] update gtype --- libdeepgalois/include/deepgalois/gtypes.h | 11 +++++++++-- libdeepgalois/src/context.cu | 10 ++++++---- lonestar/gnn/gcn/CMakeLists.txt | 6 +++--- lonestar/gnn/include/lonestargnn.h | 2 -- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index d12ac8e0d1..c011ad4537 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,4 +1,5 @@ #pragma once +#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -10,10 +11,13 @@ //#include "galois/graphs/LCGraph.h" #include "deepgalois/lgraph.h" #else -//#include "graph_gpu.h" +#ifdef USE_CSRGRAPH +#include "graph_gpu.h" +#else #include "deepgalois/lgraph.h" #endif #endif +#endif #ifndef GALOIS_USE_DIST @@ -31,9 +35,12 @@ typedef index_t edge_iterator; //typedef Graph::edge_iterator edge_iterator; typedef LearningGraph Graph; #else -//typedef CSRGraph GraphGPU; +#ifdef USE_CSRGRAPH +typedef CSRGraph GraphGPU; +#else typedef LearningGraph GraphGPU; #endif +#endif } #else diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 528c34b7e5..bbaddf0e99 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -128,17 +128,19 @@ void Context::SetDevice(const int device_id) { */ size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; - /*GraphGPU g; - graph.read(filename.c_str(), false); +#ifdef USE_CSRGRAPH + GraphGPU g; + g.read(filename.c_str(), false); if (selfloop) { g.add_selfloop(); is_selfloop_added = selfloop; } g.copy_to_gpu(graph_gpu); - n = graph_gpu.nnodes; - */ +#else graph_gpu.readGraphFromGRFile(filename); graph_gpu.copy_to_gpu(); +#endif + n = graph_gpu.size(); return n; } diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index 80daca4c78..ca8dcaa83e 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,6 +1,6 @@ -if(ENABLE_HETERO_GALOIS) - set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) -endif() +#if(ENABLE_HETERO_GALOIS) +# set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) +#endif() add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index a72668daab..d0255b9368 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -109,6 +109,4 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("(NULL)", "Hostname", name); } -#include "deepgalois/types.h" -#include "deepgalois/utils.h" #include "deepgalois/net.h" From 4f07d348430c6d5961dd21be44e7863fb8c9d3c5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 May 2020 15:56:56 -0500 Subject: [PATCH 256/660] TODO something needs to be included as part of dg_cpu, dg_gpu doesn't build either --- libdeepgalois/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 5a732ccf93..6c694dc038 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -114,6 +114,12 @@ if(NOT ENABLE_HETERO_GALOIS) src/net.cpp ) endif(ENABLE_DIST_GALOIS) +else() + # dummy sources set for dg_cpu for HETERO build + # TODO fix this + set(sources + src/net.cpp + ) endif(NOT ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC 
${sources}) From 3ace80fce5bc0eb6b02ef90d2d2956dcf0b51c71 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 18:09:14 -0500 Subject: [PATCH 257/660] fix gpu compilation --- libdeepgalois/CMakeLists.txt | 6 +- libdeepgalois/include/deepgalois/gtypes.h | 5 +- .../include/deepgalois/layers/l2_norm_layer.h | 8 +- .../include/deepgalois/layers/layer.h | 9 +- libdeepgalois/include/deepgalois/lgraph.h | 32 ++-- libdeepgalois/include/deepgalois/reader.h | 4 +- libdeepgalois/src/context.cpp | 6 +- libdeepgalois/src/context.cu | 4 +- libdeepgalois/src/layers/l2_norm_layer.cpp | 10 -- libdeepgalois/src/layers/layer.cpp | 12 -- libdeepgalois/src/lgraph.cpp | 142 +----------------- libdeepgalois/src/lgraph.cu | 8 + libdeepgalois/src/net.cu | 2 + libdeepgalois/src/optimizer.cu | 1 + libdeepgalois/src/reader.cpp | 107 +++++++++++++ lonestar/gnn/gcn/CMakeLists.txt | 2 +- 16 files changed, 169 insertions(+), 189 deletions(-) delete mode 100644 libdeepgalois/src/layers/layer.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 6c694dc038..41e5130818 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -84,7 +84,6 @@ if(NOT ENABLE_HETERO_GALOIS) src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp - src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp src/reader.cpp @@ -103,7 +102,6 @@ if(NOT ENABLE_HETERO_GALOIS) src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp - src/layers/layer.cpp src/optimizer.cpp src/context.cpp src/sampler.cpp @@ -117,9 +115,7 @@ if(NOT ENABLE_HETERO_GALOIS) else() # dummy sources set for dg_cpu for HETERO build # TODO fix this - set(sources - src/net.cpp - ) + set(sources src/reader.cpp) endif(NOT ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC ${sources}) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index c011ad4537..9b85007d28 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,5 +1,5 @@ #pragma once -#define USE_CSRGRAPH +//#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -23,7 +23,6 @@ namespace deepgalois { typedef index_t edge_iterator; -#ifdef CPU_ONLY //#ifdef EDGE_LABEL //typedef galois::graphs::LC_CSR_Graph:: // with_numa_alloc::type ::with_no_lockable::type LCGraph; @@ -34,13 +33,11 @@ typedef index_t edge_iterator; //typedef LCGraph Graph; //typedef Graph::edge_iterator edge_iterator; typedef LearningGraph Graph; -#else #ifdef USE_CSRGRAPH typedef CSRGraph GraphGPU; #else typedef LearningGraph GraphGPU; #endif -#endif } #else diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h index b15c1ae671..29e29f3474 100644 --- a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -5,7 +5,12 @@ namespace deepgalois { // L2 Normalization Layer class l2_norm_layer : public layer { public: - l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims); + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + } l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) : l2_norm_layer(level, 1e-12, 20, in_dims, 
out_dims) {} ~l2_norm_layer() {} @@ -17,4 +22,5 @@ class l2_norm_layer : public layer { float_t epsilon_; float_t scale_; }; + } // namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index cebef58059..c0f03aafd3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -8,7 +8,7 @@ * All rights reserved. * Reused/revised under 3-BSD */ - +#include #include "deepgalois/gtypes.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" @@ -53,9 +53,12 @@ class layer : public deepgalois::node { output_dims(out_dims), labels(NULL) { } virtual ~layer() = default; virtual std::string layer_type() const = 0; - void print_layer_info(); //! debug print function virtual void malloc_and_init() {} - + void print_layer_info() { //! debug print function + std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" + << input_dims[0] << "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; + } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } virtual acc_t get_weight_decay_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 77d48d87a6..eb53a4d930 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -2,6 +2,12 @@ #include "deepgalois/types.h" #include +#ifdef __CUDACC__ +#define CUDA_HOSTDEV __host__ __device__ +#else +#define CUDA_HOSTDEV +#endif + namespace deepgalois { class LearningGraph { @@ -32,8 +38,6 @@ class LearningGraph { LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } - void readGraph(std::string path, std::string dataset); - void readGraphFromGRFile(const std::string& filename); size_t size() { return (size_t)num_vertices_; } size_t sizeEdges() { return (size_t)num_edges_; } index_t get_degree(index_t vid) { return degrees_[vid]; } @@ -46,12 +50,20 @@ class LearningGraph { void copy_to_gpu(); void dealloc(); void degree_counting(); - void allocateFrom(index_t nv, index_t ne); void constructNodes(); void fixEndEdge(index_t vid, index_t row_end); void constructEdge(index_t eid, index_t dst, edata_t edata); void add_selfloop(); - + void readGraph(std::string dataset); + void allocateFrom(index_t nv, index_t ne) { + //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + num_vertices_ = nv; + num_edges_ = ne; + rowptr_.resize(num_vertices_+1); + colidx_.resize(num_edges_); + degrees_.resize(num_vertices_); + rowptr_[0] = 0; + } bool isLocal(index_t vid); index_t getLID(index_t vid); bool is_vertex_cut(); @@ -73,12 +85,12 @@ class LearningGraph { edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } #else - __device__ index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } - __device__ index_t edge_begin(index_t src) { return d_rowptr_[src]; } - __device__ index_t edge_end(index_t src) { return d_rowptr_[src+1]; } - __device__ vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } - __device__ index_t getDegree(index_t vid) { return d_degrees_[vid]; } - __device__ index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } + 
CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src+1]; } + CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } index_t *row_start_ptr() { return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } index_t *edge_dst_ptr() { return d_colidx_; } diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 090ec817f8..e25124cbfd 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,11 +1,12 @@ #pragma once -#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" namespace deepgalois { class Reader { private: std::string dataset_str; + void progressPrint(unsigned maxii, unsigned ii); public: Reader() : dataset_str("") {} Reader(std::string dataset) : dataset_str(dataset) {} @@ -13,6 +14,7 @@ class Reader { size_t read_labels(bool is_single_class, label_t*& labels); size_t read_features(float_t*& feats, std::string filetype = "bin"); size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); + void readGraphFromGRFile(Graph *g); }; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bfa006a1d7..757279ceba 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -69,7 +69,7 @@ size_t Context::read_graph(bool selfloop) { printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "bin") { - graph_cpu->readGraphFromGRFile(filename); + graph_cpu->readGraph(dataset); } else if (filetype == "gr") { graph_cpu = new Graph(); std::string filename = path + dataset + ".csgr"; @@ -77,11 +77,11 @@ size_t Context::read_graph(bool selfloop) { if (selfloop) { Graph graph_temp; //galois::graphs::readGraph(graph_temp, filename); - graph_temp.readGraphFromGRFile(filename); + graph_temp.readGraph(dataset); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; //} else galois::graphs::readGraph(*graph_cpu, filename); - } else graph_cpu->readGraphFromGRFile(filename); + } else graph_cpu->readGraph(dataset); // TODO dist version of self loop } else { printf("Unkown file format\n"); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index bbaddf0e99..17356c4423 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -127,8 +127,8 @@ void Context::SetDevice(const int device_id) { } */ size_t Context::read_graph(bool selfloop) { - std::string filename = path + dataset + ".csgr"; #ifdef USE_CSRGRAPH + std::string filename = path + dataset + ".csgr"; GraphGPU g; g.read(filename.c_str(), false); if (selfloop) { @@ -137,7 +137,7 @@ size_t Context::read_graph(bool selfloop) { } g.copy_to_gpu(graph_gpu); #else - graph_gpu.readGraphFromGRFile(filename); + graph_gpu.readGraph(dataset); graph_gpu.copy_to_gpu(); #endif n = graph_gpu.size(); diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index f1cb6a4445..864eaeb321 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -4,15 +4,6 @@ namespace deepgalois { -l2_norm_layer::l2_norm_layer(unsigned level, float_t eps, float_t scale, - dims_t in_dims, dims_t out_dims) - : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { - assert(input_dims[0] 
== output_dims[0]); // num_vertices - trainable_ = false; - name_ = layer_type() + "_" + std::to_string(level); -} - -#ifdef CPU_ONLY void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; @@ -51,6 +42,5 @@ void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, } }, galois::loopname("d_l2_norm")); } -#endif } // namespace diff --git a/libdeepgalois/src/layers/layer.cpp b/libdeepgalois/src/layers/layer.cpp deleted file mode 100644 index 6abb1ffb6a..0000000000 --- a/libdeepgalois/src/layers/layer.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "deepgalois/layers/layer.h" -#include "galois/Galois.h" - -namespace deepgalois { - -void layer::print_layer_info() { - galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", - input_dims[0], ",", input_dims[1], "] output[", - output_dims[0], ",", output_dims[1], "]\n"); -} - -} diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 6531034794..76187f302f 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -1,12 +1,7 @@ #include "deepgalois/lgraph.h" #include "deepgalois/utils.h" +#include "deepgalois/reader.h" #include "galois/Galois.h" -#include -#include -#include -#include /* For O_RDWR */ -#include /* For open(), creat() */ -#include #include #include @@ -26,43 +21,12 @@ uint64_t LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } -void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { - const unsigned nsteps = 10; - unsigned ineachstep = (maxii / nsteps); - if(ineachstep == 0) ineachstep = 1; - if (ii % ineachstep == 0) { - int progress = ((size_t) ii * 100) / maxii + 1; - printf("\t%3d%%\r", progress); - fflush(stdout); - } +void LearningGraph::constructNodes() { } -void LearningGraph::allocateFrom(index_t nv, index_t ne) { - //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); -/* - if (num_vertices_ != nv) { - if (rowptr_ != NULL) delete [] rowptr_; - if (degrees_ != NULL) delete [] degrees_; - if (vertex_data_ != NULL) delete [] vertex_data_; - num_vertices_ = nv; - } - if (num_edges_ != ne) { - if (colidx_ != NULL) delete [] colidx_; - if (edge_data_ != NULL) delete [] edge_data_; - num_edges_ = ne; - } - if (rowptr_ == NULL) rowptr_ = new index_t[num_vertices_+1]; - if (colidx_ == NULL) colidx_ = new index_t[num_edges_]; -*/ - num_vertices_ = nv; - num_edges_ = ne; - rowptr_.resize(num_vertices_+1); - colidx_.resize(num_edges_); - degrees_.resize(num_vertices_); - rowptr_[0] = 0; -} - -void LearningGraph::constructNodes() { +void LearningGraph::readGraph(std::string dataset) { + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); } void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { @@ -118,102 +82,6 @@ void LearningGraph::add_selfloop() { //print_neighbors(0); } -void LearningGraph::readGraph(std::string path, std::string dataset) { - std::string filename = path + dataset + ".csgr"; -} - -void LearningGraph::readGraphFromGRFile(const std::string& filename) { - std::ifstream ifs; - ifs.open(filename); - int masterFD = open(filename.c_str(), O_RDONLY); - if (masterFD == -1) { - std::cout << "LearningGraph: unable to open" << filename << "\n"; - exit(1); - } - struct stat buf; - int f = fstat(masterFD, &buf); - if (f == -1) { - std::cout << "LearningGraph: unable to stat" << filename << "\n"; - exit(1); - } - size_t masterLength = buf.st_size; - int _MAP_BASE = 
MAP_PRIVATE; - void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); - if (m == MAP_FAILED) { - m = 0; - std::cout << "LearningGraph: mmap failed.\n"; - exit(1); - } - Timer t; - t.Start(); - - uint64_t* fptr = (uint64_t*)m; - __attribute__((unused)) uint64_t version = le64toh(*fptr++); - assert(version == 1); - uint64_t sizeEdgeTy = le64toh(*fptr++); - uint64_t nv = le64toh(*fptr++); - uint64_t ne = le64toh(*fptr++); - uint64_t *outIdx = fptr; - fptr += nv; - uint32_t *fptr32 = (uint32_t*)fptr; - uint32_t *outs = fptr32; - fptr32 += ne; - if (ne % 2) fptr32 += 1; - num_vertices_ = nv; - num_edges_ = ne; - if (sizeEdgeTy != 0) { - std::cout << "LearningGraph: currently edge data not supported.\n"; - exit(1); - } - - printf("num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); - allocateFrom(nv, ne); - //degrees_ = new index_t[num_vertices_]; - //rowptr_ = new index_t[num_vertices_+1]; - //colidx_ = new index_t[num_edges_]; - //rowptr_[0] = 0; - for (unsigned ii = 0; ii < num_vertices_; ++ii) { - rowptr_[ii+1] = le64toh(outIdx[ii]); - degrees_[ii] = rowptr_[ii+1] - rowptr_[ii]; - for (unsigned jj = 0; jj < degrees_[ii]; ++jj) { - unsigned eid = rowptr_[ii] + jj; - unsigned dst = le32toh(outs[eid]); - if (dst >= num_vertices_) { - printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); - exit(0); - } - colidx_[eid] = dst; - } - progressPrint(num_vertices_, ii); - } - ifs.close(); - -/* - std::string file_dims = path + dataset + "-dims.bin"; - std::string file_rowptr = path + dataset + "-rowptr.bin"; - std::string file_colidx = path + dataset + "-colidx.bin"; - index_t dims[2]; - ifs.open(file_dims, std::ios::binary|std::ios::in); - ifs.read((char*)dims, sizeof(index_t) * 2); - ifs.close(); - num_vertices_ = dims[0]; - num_edges_ = dims[1]; - degrees_ = new index_t[num_vertices_]; - rowptr_ = new index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; - ifs.open(file_rowptr, std::ios::binary|std::ios::in); - ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); - ifs.close(); - ifs.open(file_colidx, std::ios::binary|std::ios::in); - ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); - ifs.close(); -*/ - t.Stop(); - double runtime = t.Millisecs(); - std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" - << masterLength/1000.0/runtime << " MB/s)\n\n"; -} - #ifdef CPU_ONLY void LearningGraph::dealloc() { /* diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 3a379a649e..42280956d4 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -1,9 +1,15 @@ #include "deepgalois/lgraph.h" #include "deepgalois/cutils.h" +#include "deepgalois/reader.h" #include namespace deepgalois { +void LearningGraph::readGraph(std::string dataset) { + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); +} + void LearningGraph::dealloc() { assert(is_device); CUDA_CHECK(cudaFree(d_colidx_)); @@ -40,4 +46,6 @@ void LearningGraph::copy_to_cpu() { //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } +void LearningGraph::degree_counting() {} + } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 1a50c0c551..7d3e61bac8 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -162,6 +162,8 @@ void Net::regularize() { layers[layer_id]->get_grads_device_ptr()); } +void Net::normalize() {} + acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* 
masks, float_t* preds, label_t* ground_truth) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 6953a804c1..0fd16803fd 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -27,6 +27,7 @@ float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { return dE_[Index][key]; } +void adam::update(const vec_t& dW, vec_t& W) {} void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 2ea8134254..7497cb2887 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -1,6 +1,13 @@ #include "deepgalois/reader.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" +#include +#include +#include +#include /* For O_RDWR */ +#include /* For open(), creat() */ +#include +#include namespace deepgalois { @@ -141,4 +148,104 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t return sample_count; } +void Reader::progressPrint(unsigned maxii, unsigned ii) { + const unsigned nsteps = 10; + unsigned ineachstep = (maxii / nsteps); + if(ineachstep == 0) ineachstep = 1; + if (ii % ineachstep == 0) { + int progress = ((size_t) ii * 100) / maxii + 1; + printf("\t%3d%%\r", progress); + fflush(stdout); + } +} + +void Reader::readGraphFromGRFile(Graph *g) { + std::string filename = path + dataset_str + ".csgr"; + std::ifstream ifs; + ifs.open(filename); + int masterFD = open(filename.c_str(), O_RDONLY); + if (masterFD == -1) { + std::cout << "LearningGraph: unable to open" << filename << "\n"; + exit(1); + } + struct stat buf; + int f = fstat(masterFD, &buf); + if (f == -1) { + std::cout << "LearningGraph: unable to stat" << filename << "\n"; + exit(1); + } + size_t masterLength = buf.st_size; + int _MAP_BASE = MAP_PRIVATE; + void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); + if (m == MAP_FAILED) { + m = 0; + std::cout << "LearningGraph: mmap failed.\n"; + exit(1); + } + Timer t; + t.Start(); + + uint64_t* fptr = (uint64_t*)m; + __attribute__((unused)) uint64_t version = le64toh(*fptr++); + assert(version == 1); + uint64_t sizeEdgeTy = le64toh(*fptr++); + uint64_t nv = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); + uint64_t *outIdx = fptr; + fptr += nv; + uint32_t *fptr32 = (uint32_t*)fptr; + uint32_t *outs = fptr32; + fptr32 += ne; + if (ne % 2) fptr32 += 1; + if (sizeEdgeTy != 0) { + std::cout << "LearningGraph: currently edge data not supported.\n"; + exit(1); + } + printf("num_vertices %lu, num_edges %lu.\n", nv, ne); + g->allocateFrom(nv, ne); + auto rowptr = g->row_start_ptr(); + auto colidx = g->edge_dst_ptr(); + auto degrees = g->degrees_ptr(); + for (unsigned ii = 0; ii < nv; ++ii) { + rowptr[ii+1] = le64toh(outIdx[ii]); + degrees[ii] = rowptr[ii+1] - rowptr[ii]; + for (unsigned jj = 0; jj < degrees[ii]; ++jj) { + unsigned eid = rowptr[ii] + jj; + unsigned dst = le32toh(outs[eid]); + if (dst >= nv) { + printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); + exit(0); + } + colidx[eid] = dst; + } + progressPrint(nv, ii); + } + ifs.close(); + +/* + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + 
ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); +*/ + t.Stop(); + double runtime = t.Millisecs(); + std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + << masterLength/1000.0/runtime << " MB/s)\n\n"; +} + } diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index ca8dcaa83e..fc5f134d76 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -7,7 +7,7 @@ target_link_libraries(gcn PRIVATE Galois::shmem lonestar) if(ENABLE_HETERO_GALOIS) set_property(TARGET gcn PROPERTY CUDA_STANDARD 14) set_property(TARGET gcn PROPERTY CUDA_SEPARABLE_COMPILATION ON) - target_link_libraries(gcn PRIVATE dg_gpu) + target_link_libraries(gcn PRIVATE dg_gpu dg_cpu) target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(gcn PRIVATE dg_cpu) From f043c193bb6f4b18a884f32a9744f6848dffef97 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 18:37:43 -0500 Subject: [PATCH 258/660] fix bug in reader --- libdeepgalois/include/deepgalois/lgraph.h | 51 +++++++++++++++++-- libdeepgalois/src/context.cu | 4 ++ libdeepgalois/src/lgraph.cpp | 62 +---------------------- libdeepgalois/src/reader.cpp | 28 +++++----- 4 files changed, 65 insertions(+), 80 deletions(-) diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index eb53a4d930..e9dd995f93 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -1,6 +1,7 @@ #pragma once #include "deepgalois/types.h" #include +#include #ifdef __CUDACC__ #define CUDA_HOSTDEV __host__ __device__ @@ -50,11 +51,12 @@ class LearningGraph { void copy_to_gpu(); void dealloc(); void degree_counting(); - void constructNodes(); - void fixEndEdge(index_t vid, index_t row_end); - void constructEdge(index_t eid, index_t dst, edata_t edata); - void add_selfloop(); + void constructNodes() {} + void readGraph(std::string dataset); + void fixEndEdge(index_t vid, index_t row_end) { + rowptr_[vid+1] = row_end; + } void allocateFrom(index_t nv, index_t ne) { //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); num_vertices_ = nv; @@ -64,6 +66,46 @@ class LearningGraph { degrees_.resize(num_vertices_); rowptr_[0] = 0; } + void constructEdge(index_t eid, index_t dst, edata_t edata = 0) { + assert(dst < num_vertices_); + assert(eid < num_edges_); + colidx_[eid] = dst; + if (edge_data_) edge_data_[eid] = edata; + } + void add_selfloop() { + //print_neighbors(nnodes-1); + //print_neighbors(0); + auto old_colidx_ = colidx_; + colidx_.resize(num_vertices_ + num_edges_); + for (index_t i = 0; i < num_vertices_; i++) { + auto start = rowptr_[i]; + auto end = rowptr_[i+1]; + bool selfloop_inserted = false; + if (start == end) { + colidx_[start+i] = i; + continue; + } + for (auto e = start; e != end; e++) { + auto dst = old_colidx_[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + colidx_[e+i] = i; + colidx_[e+i+1] = dst; + } else if (e+1 
== end) { + selfloop_inserted = true; + colidx_[e+i+1] = i; + colidx_[e+i] = dst; + } else colidx_[e+i] = dst; + } else colidx_[e+i+1] = dst; + } + } + for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; + num_edges_ += num_vertices_; + //print_neighbors(nnodes-1); + //print_neighbors(0); + } + bool isLocal(index_t vid); index_t getLID(index_t vid); bool is_vertex_cut(); @@ -71,6 +113,7 @@ class LearningGraph { uint64_t numMasters(); uint64_t globalSize(); + index_t* row_start_host_ptr() { return &rowptr_[0]; } #ifdef CPU_ONLY index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 17356c4423..65b0e6304a 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -138,6 +138,10 @@ size_t Context::read_graph(bool selfloop) { g.copy_to_gpu(graph_gpu); #else graph_gpu.readGraph(dataset); + if (selfloop) { + graph_gpu.add_selfloop(); + is_selfloop_added = selfloop; + } graph_gpu.copy_to_gpu(); #endif n = graph_gpu.size(); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 76187f302f..26811280a1 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -3,7 +3,6 @@ #include "deepgalois/reader.h" #include "galois/Galois.h" #include -#include namespace deepgalois { @@ -21,25 +20,11 @@ uint64_t LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } -void LearningGraph::constructNodes() { -} - void LearningGraph::readGraph(std::string dataset) { deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } -void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { - rowptr_[vid+1] = row_end; -} - -void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { - assert(dst < num_vertices_); - assert(eid < num_edges_); - colidx_[eid] = dst; - if (edge_data_) edge_data_[eid] = edata; -} - void LearningGraph::degree_counting() { //if (degrees_ != NULL) return; //degrees_ = new index_t[num_vertices_]; @@ -48,51 +33,6 @@ void LearningGraph::degree_counting() { }, galois::loopname("DegreeCounting")); } -void LearningGraph::add_selfloop() { - //print_neighbors(nnodes-1); - //print_neighbors(0); - auto old_colidx_ = colidx_; - colidx_.resize(num_vertices_ + num_edges_); - for (index_t i = 0; i < num_vertices_; i++) { - auto start = rowptr_[i]; - auto end = rowptr_[i+1]; - bool selfloop_inserted = false; - if (start == end) { - colidx_[start+i] = i; - continue; - } - for (auto e = start; e != end; e++) { - auto dst = old_colidx_[e]; - if (!selfloop_inserted) { - if (i < dst) { - selfloop_inserted = true; - colidx_[e+i] = i; - colidx_[e+i+1] = dst; - } else if (e+1 == end) { - selfloop_inserted = true; - colidx_[e+i+1] = i; - colidx_[e+i] = dst; - } else colidx_[e+i] = dst; - } else colidx_[e+i+1] = dst; - } - } - for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; - num_edges_ += num_vertices_; - //print_neighbors(nnodes-1); - //print_neighbors(0); -} - -#ifdef CPU_ONLY -void LearningGraph::dealloc() { -/* - assert (!is_device); - if (rowptr_ != NULL) delete [] rowptr_; - if (colidx_ != NULL) delete [] colidx_; - if (degrees_ != NULL) delete [] degrees_; - if (vertex_data_ != NULL) delete [] vertex_data_; - if (edge_data_ != NULL) delete [] edge_data_; -//*/ -} -#endif +void LearningGraph::dealloc() {} } // end namespace diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 
7497cb2887..519e27496a 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -148,12 +148,12 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t return sample_count; } -void Reader::progressPrint(unsigned maxii, unsigned ii) { +void Reader::progressPrint(unsigned max, unsigned i) { const unsigned nsteps = 10; - unsigned ineachstep = (maxii / nsteps); + unsigned ineachstep = (max / nsteps); if(ineachstep == 0) ineachstep = 1; - if (ii % ineachstep == 0) { - int progress = ((size_t) ii * 100) / maxii + 1; + if (i % ineachstep == 0) { + int progress = ((size_t) i * 100) / max + 1; printf("\t%3d%%\r", progress); fflush(stdout); } @@ -203,22 +203,20 @@ void Reader::readGraphFromGRFile(Graph *g) { } printf("num_vertices %lu, num_edges %lu.\n", nv, ne); g->allocateFrom(nv, ne); - auto rowptr = g->row_start_ptr(); - auto colidx = g->edge_dst_ptr(); - auto degrees = g->degrees_ptr(); - for (unsigned ii = 0; ii < nv; ++ii) { - rowptr[ii+1] = le64toh(outIdx[ii]); - degrees[ii] = rowptr[ii+1] - rowptr[ii]; - for (unsigned jj = 0; jj < degrees[ii]; ++jj) { - unsigned eid = rowptr[ii] + jj; + auto rowptr = g->row_start_host_ptr(); + for (unsigned vid = 0; vid < nv; ++vid) { + g->fixEndEdge(vid, le64toh(outIdx[vid])); + auto degree = rowptr[vid+1] - rowptr[vid]; + for (unsigned jj = 0; jj < degree; ++jj) { + unsigned eid = rowptr[vid] + jj; unsigned dst = le32toh(outs[eid]); if (dst >= nv) { - printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); + printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, eid); exit(0); } - colidx[eid] = dst; + g->constructEdge(eid, dst); } - progressPrint(nv, ii); + progressPrint(nv, vid); } ifs.close(); From f7887794c70bcf842a719471251f2b61d50bbcf7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 20:06:32 -0500 Subject: [PATCH 259/660] fix subgraph bug --- libdeepgalois/include/deepgalois/cutils.h | 7 +++++++ libdeepgalois/include/deepgalois/lgraph.h | 15 +++++++-------- libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/src/layers/aggregator.cu | 12 +++++++++--- libdeepgalois/src/layers/graph_conv_layer.cu | 10 ++-------- libdeepgalois/src/lgraph.cu | 18 +++++++++++------- libdeepgalois/src/math_functions.cu | 4 ++-- libdeepgalois/src/net.cu | 5 ++--- lonestar/gnn/gcn/README.md | 10 +++++++--- 9 files changed, 48 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 383c9d6325..9466f55c53 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -180,3 +180,10 @@ inline void print_device_vector(size_t n, const float_t *d_x, std::string name = delete[] h_x; } +inline void print_device_int_vector(size_t n, const int *d_x, std::string name = "x") { + int *h_x = new int[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(int), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete[] h_x; +} + diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index e9dd995f93..21caec947d 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -34,7 +34,6 @@ class LearningGraph { public: typedef size_t iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), - //rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} 
LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } @@ -58,7 +57,7 @@ class LearningGraph { rowptr_[vid+1] = row_end; } void allocateFrom(index_t nv, index_t ne) { - //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + //printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, num_edges_); num_vertices_ = nv; num_edges_ = ne; rowptr_.resize(num_vertices_+1); @@ -73,8 +72,6 @@ class LearningGraph { if (edge_data_) edge_data_[eid] = edata; } void add_selfloop() { - //print_neighbors(nnodes-1); - //print_neighbors(0); auto old_colidx_ = colidx_; colidx_.resize(num_vertices_ + num_edges_); for (index_t i = 0; i < num_vertices_; i++) { @@ -102,8 +99,7 @@ class LearningGraph { } for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; num_edges_ += num_vertices_; - //print_neighbors(nnodes-1); - //print_neighbors(0); + printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, num_edges_); } bool isLocal(index_t vid); @@ -114,6 +110,7 @@ class LearningGraph { uint64_t globalSize(); index_t* row_start_host_ptr() { return &rowptr_[0]; } + index_t* edge_dst_host_ptr() { return &colidx_[0]; } #ifdef CPU_ONLY index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } @@ -132,8 +129,10 @@ class LearningGraph { CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src+1]; } CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } - CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } - CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + //CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + //CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } index_t *row_start_ptr() { return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } index_t *edge_dst_ptr() { return d_colidx_; } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 5bab5f12d2..9ad0ac7d15 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -36,7 +36,7 @@ class Net { neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), - val_interval(val_itv), is_selfloop(selfloop) { + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { assert(n_conv > 0); std::cout << "Configuration: num_threads " << num_threads << ", num_conv_layers " << num_conv_layers diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index fe3aca0182..e1bee86c47 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -74,11 +74,17 @@ void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - //print_device_vector(10, norm_factor, "norm_factor"); + std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << 
"\n"; + print_device_vector(10, norm_factor, "norm_factor"); float *temp; + const int *row_start = (const int*)g.row_start_ptr(); + const int *edge_dst = (const int*)g.edge_dst_ptr(); + printf("row_start_ptr: 0x%x\n", row_start); + printf("edge_dst_ptr: 0x%x\n", edge_dst); + print_device_int_vector(10, row_start, "row_start"); + print_device_int_vector(10, edge_dst, "edge_dst"); float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); + csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); float_free_device(temp); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index e814631022..a1682847ad 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -9,24 +9,19 @@ void graph_conv_layer::malloc_and_init() { size_t z = output_dims[1]; if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); - //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); float_malloc_device(x*y, in_temp); init_const_gpu(x*y, 0.0, in_temp); if (y <= z) { float_malloc_device(x*y, in_temp1); init_const_gpu(x*y, 0.0, in_temp1); } - //CUDA_CHECK(cudaMalloc((void**)&out_temp, x * z * sizeof(float_t))); float_malloc_device(x*z, out_temp); init_const_gpu(x*z, 0.0, out_temp); - //CUDA_CHECK(cudaMalloc((void**)&d_W, y * z * sizeof(float_t))); float_malloc_device(y*z, d_W); auto init_range = sqrt(6.0 / (y + z)); // Glorot & Bengio (AISTATS 2010) rng_uniform_gpu(y * z, -init_range, init_range, d_W); - //CUDA_CHECK(cudaMalloc((void**)&layer::d_weight_grad, y * z * sizeof(float_t))); float_malloc_device(y*z, layer::d_weight_grad); - //CUDA_CHECK(cudaMemset(layer::d_weight_grad, 0, y * z * sizeof(float_t))); init_const_gpu(y*z, 0.0, layer::d_weight_grad); } @@ -56,9 +51,9 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t y = input_dims[1]; size_t z = output_dims[1]; + // currently only support feature length <= 128 if (z > MAX_NUM_CLASSES) { std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; - // currently only support feature length <= 128 exit(0); } init_const_gpu(x*z, 0.0, out_temp); @@ -76,8 +71,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) -void graph_conv_layer::back_propagation(const float_t* in_data, - const float_t* out_data, +void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 42280956d4..54e696c5ca 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -23,7 +23,7 @@ void LearningGraph::allocOnDevice(bool no_edge_data__) { if (d_colidx_ != NULL) return; CUDA_CHECK(cudaMalloc((void **) &d_colidx_, num_edges_ * sizeof(index_t))); CUDA_CHECK(cudaMalloc((void **) &d_rowptr_, (num_vertices_+1) * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); + //CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, 
num_edges_ * sizeof(edge_data___t))); //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); is_device = true; @@ -31,17 +31,21 @@ void LearningGraph::allocOnDevice(bool no_edge_data__) { void LearningGraph::copy_to_gpu() { allocOnDevice(edge_data_ == NULL); - CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + printf("row_start_ptr: 0x%x\n", d_rowptr_); + printf("edge_dst_ptr: 0x%x\n", d_colidx_); + print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); + print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); + //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); } void LearningGraph::copy_to_cpu() { - CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); + //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 1f9c020676..2ef1e9a803 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -208,9 +208,9 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, // workspace memory (transpose_C) for this. 
void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, + const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { - //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 7d3e61bac8..98a5e82010 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -23,8 +23,7 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, float_t* preds, label_t* labels, HGAccumulator total) { total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage - local_accuracy; + __shared__ cub::BlockReduce::TempStorage local_accuracy; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { label_t pred = (label_t)argmax_device(num_classes, preds + (begin + i) * num_classes); @@ -72,7 +71,7 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, atomicAdd(&true_negtive[j], 1.0); } } - } + } } } diff --git a/lonestar/gnn/gcn/README.md b/lonestar/gnn/gcn/README.md index bae49e36a0..ba680b1f5e 100644 --- a/lonestar/gnn/gcn/README.md +++ b/lonestar/gnn/gcn/README.md @@ -26,12 +26,16 @@ BUILD RUN =========== +Datasets: +(1) single-class: cora citeseer pubmed flickr reddit +(2) multi-class: ppi yelp amazon + The following are a few example command lines. 
$ export OPENBLAS_NUM_THREADS=28 -$ ./gnn cora -t=1 -k=3 -$ ./gnn citeseer -t=3 -k=30 -$ ./gnn reddit -t=56 -k=3 +$ ./gnn cora -t=1 -k=30 +$ ./gnn reddit -t=56 -k=200 +$ ./gcn reddit -k=200 -t=56 -ss=9000 -dr=0.1 -h=128 -vi=20 PERFORMANCE =========== From 2272e8a7685564f4607d14f3021da404cd686ed2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 21:26:10 -0500 Subject: [PATCH 260/660] use CSRGraph --- libdeepgalois/include/deepgalois/gtypes.h | 3 +- libdeepgalois/include/deepgalois/lgraph.h | 1 + libdeepgalois/src/context.cu | 4 +- libdeepgalois/src/layers/aggregator.cu | 16 ++++---- libdeepgalois/src/lgraph.cu | 12 ++++-- libdeepgalois/src/math_functions.cu | 2 +- libgpu/include/graph_gpu.h | 46 +++++++++++++---------- 7 files changed, 49 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 9b85007d28..e06a9c3fe0 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,5 +1,5 @@ #pragma once -//#define USE_CSRGRAPH +#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -12,6 +12,7 @@ #include "deepgalois/lgraph.h" #else #ifdef USE_CSRGRAPH +#include "deepgalois/lgraph.h" #include "graph_gpu.h" #else #include "deepgalois/lgraph.h" diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 21caec947d..d9e6e60d1d 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -142,6 +142,7 @@ class LearningGraph { vdata_t *vertex_data_ptr() { return d_vertex_data_; } //const vdata_t *vertex_data_ptr() const { return vertex_data_; } //const edata_t *edge_data_ptr() const { return edge_data; } + void print_test(); #endif }; diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 65b0e6304a..f7a76d2db0 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -45,9 +45,9 @@ __global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_ d_src = 1.0 / sqrt(d_src); auto start = graph.edge_begin(src); index_t end = graph.edge_end(src); - for (index_t e = start; e != end; e++) { + for (index_t e = start; e != end; e++) { index_t dst = graph.getEdgeDst(e); - if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); + //if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); assert(d_dst != 0.0); diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index e1bee86c47..158b1c2b4d 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -72,19 +72,21 @@ void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + //g.print_test(); unsigned n = g.size(); + auto nnz = g.sizeEdges(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - print_device_vector(10, norm_factor, "norm_factor"); + //std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " nnz " << nnz << "\n"; + //print_device_vector(10, norm_factor, "norm_factor"); float *temp; const int *row_start = (const int*)g.row_start_ptr(); const int *edge_dst = (const int*)g.edge_dst_ptr(); 
- printf("row_start_ptr: 0x%x\n", row_start); - printf("edge_dst_ptr: 0x%x\n", edge_dst); - print_device_int_vector(10, row_start, "row_start"); - print_device_int_vector(10, edge_dst, "edge_dst"); + //printf("row_start_ptr: 0x%x\n", row_start); + //printf("edge_dst_ptr: 0x%x\n", edge_dst); + //print_device_int_vector(10, row_start, "row_start"); + //print_device_int_vector(10, edge_dst, "edge_dst"); float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); float_free_device(temp); } diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 54e696c5ca..2c630ca7ae 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -29,14 +29,18 @@ void LearningGraph::allocOnDevice(bool no_edge_data__) { is_device = true; } +void LearningGraph::print_test() { + printf("d_rowptr_: 0x%x\n", d_rowptr_); + printf("d_colidx_: 0x%x\n", d_colidx_); + print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); + print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); +} + void LearningGraph::copy_to_gpu() { allocOnDevice(edge_data_ == NULL); CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); - printf("row_start_ptr: 0x%x\n", d_rowptr_); - printf("edge_dst_ptr: 0x%x\n", d_colidx_); - print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); - print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); + print_test(); //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 2ef1e9a803..449b597621 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -210,7 +210,7 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { - std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index f456c367dc..b47ed326b1 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -17,6 +17,12 @@ #include #include +#ifdef __CUDACC__ +#define CUDA_HOSTDEV __host__ __device__ +#else +#define CUDA_HOSTDEV +#endif + // Adapted from LSG CSRGraph.h // TODO: make this template data @@ -42,20 +48,20 @@ struct CSRGraph { unsigned deallocOnDevice(); void dealloc(); - __device__ __host__ bool valid_node(index_type node) { + CUDA_HOSTDEV bool valid_node(index_type node) 
{ return (node < nnodes); } - __device__ __host__ bool valid_edge(index_type edge) { + CUDA_HOSTDEV bool valid_edge(index_type edge) { return (edge < nedges); } - __device__ __host__ index_type getOutDegree(unsigned src) { + CUDA_HOSTDEV index_type getOutDegree(unsigned src) { assert(src < nnodes); return row_start[src + 1] - row_start[src]; }; - __device__ __host__ index_type getDestination(unsigned src, unsigned edge) { + CUDA_HOSTDEV index_type getDestination(unsigned src, unsigned edge) { assert(src < nnodes); assert(edge < getOutDegree(src)); @@ -65,18 +71,18 @@ struct CSRGraph { return edge_dst[abs_edge]; }; - __device__ __host__ index_type getAbsDestination(unsigned abs_edge) { + CUDA_HOSTDEV index_type getAbsDestination(unsigned abs_edge) { assert(abs_edge < nedges); return edge_dst[abs_edge]; }; - __device__ __host__ index_type getFirstEdge(unsigned src) { + CUDA_HOSTDEV index_type getFirstEdge(unsigned src) { assert(src <= nnodes); // <= is okay return row_start[src]; }; - __device__ __host__ edge_data_type getWeight(unsigned src, unsigned edge) { + CUDA_HOSTDEV edge_data_type getWeight(unsigned src, unsigned edge) { assert(src < nnodes); assert(edge < getOutDegree(src)); @@ -86,7 +92,7 @@ struct CSRGraph { return edge_data[abs_edge]; }; - __device__ __host__ edge_data_type getAbsWeight(unsigned abs_edge) { + CUDA_HOSTDEV edge_data_type getAbsWeight(unsigned abs_edge) { assert(abs_edge < nedges); return edge_data[abs_edge]; @@ -138,29 +144,29 @@ struct CSRGraph { //print_neighbors(0); } - __device__ __host__ index_type getEdgeDst(unsigned edge) { + CUDA_HOSTDEV index_type getEdgeDst(unsigned edge) { assert(edge < nedges); return edge_dst[edge]; }; - __device__ __host__ node_data_type getData(unsigned vid) { + CUDA_HOSTDEV node_data_type getData(unsigned vid) { return node_data[vid]; } - __device__ __host__ index_type edge_begin(unsigned src) { + CUDA_HOSTDEV index_type edge_begin(unsigned src) { assert(src <= nnodes); return row_start[src]; }; - __device__ __host__ index_type edge_end(unsigned src) { + CUDA_HOSTDEV index_type edge_end(unsigned src) { assert(src <= nnodes); return row_start[src+1]; }; - __device__ __host__ index_type *row_start_ptr() { return row_start; } - __device__ __host__ const index_type *row_start_ptr() const { return row_start; } - __device__ __host__ index_type *edge_dst_ptr() { return edge_dst; } - __device__ __host__ const index_type *edge_dst_ptr() const { return edge_dst; } - __device__ __host__ node_data_type *node_data_ptr() { return node_data; } - __device__ __host__ const node_data_type *node_data_ptr() const { return node_data; } - __device__ __host__ edge_data_type *edge_data_ptr() { return edge_data; } - __device__ __host__ const edge_data_type *edge_data_ptr() const { return edge_data; } + CUDA_HOSTDEV index_type *row_start_ptr() { return row_start; } + CUDA_HOSTDEV const index_type *row_start_ptr() const { return row_start; } + CUDA_HOSTDEV index_type *edge_dst_ptr() { return edge_dst; } + CUDA_HOSTDEV const index_type *edge_dst_ptr() const { return edge_dst; } + CUDA_HOSTDEV node_data_type *node_data_ptr() { return node_data; } + CUDA_HOSTDEV const node_data_type *node_data_ptr() const { return node_data; } + CUDA_HOSTDEV edge_data_type *edge_data_ptr() { return edge_data; } + CUDA_HOSTDEV const edge_data_type *edge_data_ptr() const { return edge_data; } size_t size() { return size_t(nnodes); } size_t sizeEdges() { return size_t(nedges); } From 17caee78f48fe3d3a40fee0ee40a249b88635257 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 
2020 13:02:54 -0500 Subject: [PATCH 261/660] wip: HETERO not USE CPU, removing USE_DIST --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/context.h | 19 +++++++++---------- libdeepgalois/include/deepgalois/net.h | 4 ++++ lonestar/gnn/CMakeLists.txt | 4 ++-- lonestar/gnn/gcn/gcn.cpp | 12 ++++-------- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 7fce4a12d9..b110f0df89 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -34,7 +34,7 @@ class DistContext { ~DistContext(); //! save graph pointer to context object - void saveGraph(Graph* dGraph); + void saveDistGraph(Graph* dGraph); //! read labels of local nodes only size_t read_labels(std::string dataset_str); diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index f8b848f453..feacca3a4a 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,11 +7,9 @@ #include #include "deepgalois/types.h" #include "deepgalois/reader.h" -//#ifdef CPU_ONLY #include "deepgalois/gtypes.h" -//#else -//#include "graph_gpu.h" -#ifndef CPU_ONLY + +#ifdef __GALOIS_HET_CUDA__ #include "deepgalois/cutils.h" #endif @@ -20,10 +18,11 @@ namespace deepgalois { class Context { public: Context(); - Context(bool use_gpu) : is_device(use_gpu), n(0), num_classes(0), feat_len(0), is_single_class(true), - is_selfloop_added(false), use_subgraph(false), h_labels(NULL), h_feats(NULL), - d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} - + Context(bool use_gpu) : + is_device(use_gpu), n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), use_subgraph(false), + h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} ~Context(); size_t read_graph(bool selfloop); @@ -47,7 +46,7 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t *masks); void createSubgraphs(int num_subgraphs); -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); @@ -100,7 +99,7 @@ class Context { void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); #else static cublasHandle_t cublas_handle_; // used to call cuBLAS diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 9ad0ac7d15..aa62339a2a 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -38,6 +38,7 @@ class Net { learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { assert(n_conv > 0); + // TODO use galois print std::cout << "Configuration: num_threads " << num_threads << ", num_conv_layers " << num_conv_layers << ", num_epochs " << num_epochs @@ -50,6 +51,9 @@ class Net { if (has_dense) num_layers ++; // initialize feature metadata feature_dims.resize(num_layers + 1); + + + #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_dataset(dataset_str); diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index d0551bdadc..1f5d35b5f1 
100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -13,8 +13,8 @@ if(USE_MKL_BLAS) endif() link_directories(${BLAS_LIB_DIR}) -if(NOT ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +if(ENABLE_HETERO_GALOIS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") endif() if(ENABLE_DIST_GALOIS) diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index a8ab651603..97e1d71447 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -10,23 +10,19 @@ const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; int main(int argc, char** argv) { -#ifndef GALOIS_USE_DIST - galois::SharedMemSys G; -#else galois::DistMemSys G; -#endif LonestarGnnStart(argc, argv, name, desc, url); - // the neural network to train + + // the neural network to train: loads the entire graph on CPU deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); -#ifdef GALOIS_USE_DIST std::vector dummyVec; - deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + deepgalois::Graph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); network.dist_init(dGraph, dataset); -#endif // read network, features, ground truth, initialize metadata // default setting for now; can be customized by the user From df2c358b72775023345475d2146143362c6dbfbb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 13:06:22 -0500 Subject: [PATCH 262/660] ran clang-format on all of deepgalois --- .../include/deepgalois/DistContext.h | 45 +- libdeepgalois/include/deepgalois/configs.h | 5 +- libdeepgalois/include/deepgalois/context.h | 104 ++-- libdeepgalois/include/deepgalois/cutils.h | 21 +- libdeepgalois/include/deepgalois/gtypes.h | 14 +- .../deepgalois/layers/GluonGradients.h | 79 +-- .../deepgalois/layers/GradientSyncStructs.h | 21 +- .../layers/GraphConvSyncStructures.h | 13 +- .../include/deepgalois/layers/aggregator.h | 12 +- .../deepgalois/layers/arithmetic_layer.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 36 +- .../include/deepgalois/layers/l2_norm_layer.h | 14 +- .../include/deepgalois/layers/layer.h | 49 +- .../deepgalois/layers/leaky_relu_layer.h | 10 +- .../include/deepgalois/layers/linear_layer.h | 2 +- .../include/deepgalois/layers/node.h | 9 +- .../include/deepgalois/layers/relu_layer.h | 6 +- .../deepgalois/layers/sigmoid_loss_layer.h | 6 +- .../deepgalois/layers/softmax_loss_layer.h | 6 +- libdeepgalois/include/deepgalois/lgraph.h | 115 ++-- .../include/deepgalois/math_functions.hh | 110 ++-- libdeepgalois/include/deepgalois/net.h | 287 +++++---- libdeepgalois/include/deepgalois/optimizer.h | 10 +- libdeepgalois/include/deepgalois/reader.h | 8 +- libdeepgalois/include/deepgalois/sampler.h | 41 +- libdeepgalois/include/deepgalois/types.h | 24 +- libdeepgalois/include/deepgalois/utils.h | 100 ++-- libdeepgalois/src/DistContext.cpp | 52 +- libdeepgalois/src/context.cpp | 140 +++-- libdeepgalois/src/context.cu | 97 +-- libdeepgalois/src/layers/aggregator.cpp | 87 +-- libdeepgalois/src/layers/aggregator.cu | 82 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 97 +-- libdeepgalois/src/layers/graph_conv_layer.cu | 89 +-- libdeepgalois/src/layers/l2_norm_layer.cpp | 69 ++- libdeepgalois/src/layers/l2_norm_layer.cu | 10 +- libdeepgalois/src/layers/leaky_relu_layer.cpp | 17 +- 
libdeepgalois/src/layers/leaky_relu_layer.cu | 12 +- libdeepgalois/src/layers/relu_layer.cpp | 5 +- libdeepgalois/src/layers/relu_layer.cu | 10 +- .../src/layers/sigmoid_loss_layer.cpp | 100 ++-- .../src/layers/sigmoid_loss_layer.cu | 14 +- .../src/layers/softmax_loss_layer.cpp | 91 +-- .../src/layers/softmax_loss_layer.cu | 14 +- libdeepgalois/src/lgraph.cpp | 15 +- libdeepgalois/src/lgraph.cu | 56 +- libdeepgalois/src/math_functions.cpp | 220 +++---- libdeepgalois/src/math_functions.cu | 563 ++++++++++-------- libdeepgalois/src/net.cpp | 107 ++-- libdeepgalois/src/net.cu | 107 ++-- libdeepgalois/src/node.cpp | 3 +- libdeepgalois/src/node.cu | 13 +- libdeepgalois/src/optimizer.cpp | 101 ++-- libdeepgalois/src/optimizer.cu | 20 +- libdeepgalois/src/reader.cpp | 111 ++-- libdeepgalois/src/sampler.cpp | 341 ++++++----- libdeepgalois/src/utils.cpp | 103 ++-- lonestar/gnn/gcn/gcn.cpp | 12 +- 58 files changed, 2172 insertions(+), 1735 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index b110f0df89..2f65360106 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -11,23 +11,24 @@ namespace deepgalois { class DistContext { protected: - size_t localVertices; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D + size_t localVertices; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; - Graph* graph_cpu; // the input graph, |V| = N + Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; - label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE - label_t *h_labels_subg; // labels for subgraph - float_t* h_feats; // input features: N x D - float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t *d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - float_t* norm_factors_subg; // normalization constant for subgraph + label_t* h_labels; // labels for classification. Single-class label: Nx1, + // multi-class label: NxE + label_t* h_labels_subg; // labels for subgraph + float_t* h_feats; // input features: N x D + float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factors; // normalization constant based on graph structure + float_t* norm_factors_subg; // normalization constant for subgraph public: DistContext(); @@ -43,19 +44,19 @@ class DistContext { size_t read_features(std::string dataset_str); //! read masks of local nodes only - size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); + size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); //! 
find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_computing(bool is_subgraph, int subg_id = 0); - //void createSubgraphs(int num_subgraphs) {} - //void gen_subgraph_labels(size_t m, const mask_t *masks) {} - //void gen_subgraph_feats(size_t m, const mask_t *masks) {} + // void createSubgraphs(int num_subgraphs) {} + // void gen_subgraph_labels(size_t m, const mask_t *masks) {} + // void gen_subgraph_feats(size_t m, const mask_t *masks) {} // TODO define these void createSubgraphs(int) {} - void gen_subgraph_labels(size_t, const mask_t *) {} - void gen_subgraph_feats(size_t, const mask_t *) {} + void gen_subgraph_labels(size_t, const mask_t*) {} + void gen_subgraph_feats(size_t, const mask_t*) {} float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } @@ -77,6 +78,6 @@ class DistContext { float_t* get_in_ptr(); }; -} // end deepgalois namespace +} // namespace deepgalois #endif diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h index 3de67ecb74..f21dff7fed 100644 --- a/libdeepgalois/include/deepgalois/configs.h +++ b/libdeepgalois/include/deepgalois/configs.h @@ -6,6 +6,7 @@ const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset #define NUM_DATASETS 8 -const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; +const std::string dataset_names[NUM_DATASETS] = { + "cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index feacca3a4a..77c15ee890 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -18,38 +18,51 @@ namespace deepgalois { class Context { public: Context(); - Context(bool use_gpu) : - is_device(use_gpu), n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} + Context(bool use_gpu) + : is_device(use_gpu), n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), use_subgraph(false), + h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} ~Context(); size_t read_graph(bool selfloop); - size_t read_labels() { num_classes = reader.read_labels(is_single_class, h_labels); return num_classes; } - size_t read_features() { feat_len = reader.read_features(h_feats); return feat_len; } - size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + size_t read_labels() { + num_classes = reader.read_labels(is_single_class, h_labels); + return num_classes; + } + size_t read_features() { + feat_len = reader.read_features(h_feats); + return feat_len; + } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks) { return reader.read_masks(mask_type, n, begin, end, masks); } - label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label - //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label + label_t get_label(size_t i) { + return h_labels[i]; + } // single-class (one-hot) label + // label_t get_label(size_t i, size_t j) { return 
labels[i*num_classes+j]; } + // // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } - void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); } + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features void norm_factor_computing(bool is_subgraph, int subg_id = 0); - void gen_subgraph_labels(size_t m, const mask_t *masks); - void gen_subgraph_feats(size_t m, const mask_t *masks); + void gen_subgraph_labels(size_t m, const mask_t* masks); + void gen_subgraph_feats(size_t m, const mask_t* masks); void createSubgraphs(int num_subgraphs); #ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; - void add_selfloop(Graph &og, Graph &g); + void add_selfloop(Graph& og, Graph& g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; @@ -68,45 +81,52 @@ class Context { label_t* get_labels_subg_ptr() { return d_labels_subg; } inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } #endif protected: - std::string dataset; - bool is_device; // is this on device or host - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - bool is_single_class; // single-class (one-hot) or multi-class label - bool is_selfloop_added; // whether selfloop is added to the input graph - bool use_subgraph; // whether to use subgraph - label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE - float_t* h_feats; // input features: N x D - //label_t *h_labels_subg; // labels for subgraph - //float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t *d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector norm_factors_subg; // normalization constant for subgraph - //float_t* norm_factors_subg; // normalization constant for subgraph + std::string dataset; + bool is_device; // is this on device or host + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + bool is_single_class; // single-class (one-hot) or multi-class label + bool is_selfloop_added; // whether selfloop is added to the input graph + bool use_subgraph; // whether to use subgraph + label_t* h_labels; // labels for classification. 
Single-class label: Nx1, + // multi-class label: NxE + float_t* h_feats; // input features: N x D + // label_t *h_labels_subg; // labels for subgraph + // float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factors; // normalization constant based on graph structure + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector norm_factors_subg; // normalization constant for subgraph + // float_t* norm_factors_subg; // normalization constant for subgraph Reader reader; void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); #ifndef __GALOIS_HET_CUDA__ - void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); + void read_edgelist(const char* filename, bool symmetrize = false, + bool add_self_loop = false); #else - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU #endif }; -} // end deepgalois namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 9466f55c53..4e4e9842b1 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -78,9 +78,9 @@ inline const char* cusparseGetErrorString(cusparseStatus_t error) { case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; case CUSPARSE_STATUS_ZERO_PIVOT: - return "CUSPARSE_STATUS_ZERO_PIVOT"; + return "CUSPARSE_STATUS_ZERO_PIVOT"; default: break; } @@ -173,17 +173,20 @@ inline const char* curandGetErrorString(curandStatus_t error) { // CUDA: check for error after kernel execution and exit loudly if there is one. 
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) -inline void print_device_vector(size_t n, const float_t *d_x, std::string name = "x") { - float_t *h_x = new float_t[n]; +inline void print_device_vector(size_t n, const float_t* d_x, + std::string name = "x") { + float_t* h_x = new float_t[n]; CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; delete[] h_x; } -inline void print_device_int_vector(size_t n, const int *d_x, std::string name = "x") { - int *h_x = new int[n]; +inline void print_device_int_vector(size_t n, const int* d_x, + std::string name = "x") { + int* h_x = new int[n]; CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(int), cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; delete[] h_x; } - diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index e06a9c3fe0..ff4a6e4e46 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -25,21 +25,22 @@ namespace deepgalois { typedef index_t edge_iterator; //#ifdef EDGE_LABEL -//typedef galois::graphs::LC_CSR_Graph:: +// typedef galois::graphs::LC_CSR_Graph:: // with_numa_alloc::type ::with_no_lockable::type LCGraph; //#else -//typedef galois::graphs::LC_CSR_Graph:: +// typedef galois::graphs::LC_CSR_Graph:: // with_numa_alloc::type ::with_no_lockable::type LCGraph; //#endif -//typedef LCGraph Graph; -//typedef Graph::edge_iterator edge_iterator; +// typedef LCGraph Graph; +// typedef Graph::edge_iterator edge_iterator; typedef LearningGraph Graph; #ifdef USE_CSRGRAPH typedef CSRGraph GraphGPU; #else typedef LearningGraph GraphGPU; #endif -} +} // namespace deepgalois #else @@ -47,7 +48,6 @@ namespace deepgalois { // TODO check if this needs changing typedef index_t edge_iterator; using Graph = galois::graphs::DistGraph; -} +} // namespace deepgalois #endif - diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index a7aa66d576..e14fe27bc8 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -38,6 +38,7 @@ class GluonGradients { std::vector> _mirrorNodes; //! nodes that are mirrors on this host std::vector> _mirrorRanges; + public: /** * Save weight gradients + number of them (i.e. size). 
@@ -45,7 +46,7 @@ class GluonGradients { */ GluonGradients(GradientVecType& gradients, size_t numWeights) : _gradients(gradients), _numWeights(numWeights) { - _myHost = galois::runtime::getSystemNetworkInterface().ID; + _myHost = galois::runtime::getSystemNetworkInterface().ID; _totalHosts = galois::runtime::getSystemNetworkInterface().Num; // allocate a vector for each host @@ -54,13 +55,13 @@ class GluonGradients { // loop through distribution of weights to hosts for (unsigned h = 0; h < _totalHosts; h++) { std::pair curRange = - galois::block_range((size_t)0, _numWeights, h, _totalHosts); + galois::block_range((size_t)0, _numWeights, h, _totalHosts); if (h != _myHost) { // setup mirrors for the host h which is just the list of IDs - size_t curW = curRange.first; + size_t curW = curRange.first; size_t lastW = curRange.second; - size_t numW = lastW - curW; + size_t numW = lastW - curW; // set mirrors for host h _mirrorNodes[h].reserve(numW); @@ -71,8 +72,8 @@ class GluonGradients { // these belong to this host; save, then mirror ranges can be // calculated from this _beginMaster = curRange.first; - _endMaster = curRange.second; - _numOwned = _endMaster - _beginMaster; + _endMaster = curRange.second; + _numOwned = _endMaster - _beginMaster; // first range is 0 to begin master if (_beginMaster > 0) { @@ -95,44 +96,28 @@ class GluonGradients { } //! Size is number of weights - size_t size() const { - return _numWeights; - } + size_t size() const { return _numWeights; } //! Global size is number of weights - size_t globalSize() const { - return _numWeights; - } + size_t globalSize() const { return _numWeights; } //! Return the weights owned by this host - size_t numMasters() const { - return _numOwned; - } + size_t numMasters() const { return _numOwned; } //! Return host ID - unsigned myHostID() const { - return _myHost; - } + unsigned myHostID() const { return _myHost; } //! Return num hosts in the system - unsigned numHosts() const { - return _totalHosts; - } + unsigned numHosts() const { return _totalHosts; } //! GID is same as LID since all hosts have all weights - uint32_t getGID(const uint32_t nodeID) const { - return nodeID; - } + uint32_t getGID(const uint32_t nodeID) const { return nodeID; } //! LID is same as GID since all hosts have all weights - uint32_t getLID(const uint32_t nodeID) const { - return nodeID; - } + uint32_t getLID(const uint32_t nodeID) const { return nodeID; } //! Return local weight w - GradientType& getData(uint32_t w) const { - return _gradients[w]; - } + GradientType& getData(uint32_t w) const { return _gradients[w]; } //! Return ranges for mirrors (unowned nodes) const std::vector>& getMirrorRanges() const { @@ -140,50 +125,34 @@ class GluonGradients { } //! Return mirror nodes for each host from this host's point of view - std::vector>& getMirrorNodes() { - return _mirrorNodes; - } + std::vector>& getMirrorNodes() { return _mirrorNodes; } //! clears the vector // TODO return to this when we start distributing on GPUs; wrapper // end probably shouldn't be managing this MAYBE - void deallocate() { - _gradients.clear(); - } + void deallocate() { _gradients.clear(); } // Essentially no-op functions follow //! no nodes with edges - size_t getNumNodesWithEdges() { - return 0; - } + size_t getNumNodesWithEdges() { return 0; } //! No edges; not a vertex cut - bool is_vertex_cut() const { - return false; - } + bool is_vertex_cut() const { return false; } //! 
no edges, return 0 - unsigned edge_begin(uint32_t) { - return 0; - } + unsigned edge_begin(uint32_t) { return 0; } //! no edges, return 0 - unsigned edge_end(uint32_t) { - return 0; - } + unsigned edge_end(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeDst(uint32_t) { - return 0; - } + unsigned getEdgeDst(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeData(uint32_t) { - return 0; - } + unsigned getEdgeData(uint32_t) { return 0; } }; -} +} // namespace deepgalois #endif // end header guard diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 1d26b87007..dd2f3de6a9 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -6,14 +6,12 @@ struct GradientSync { using ValTy = float_t; - static ValTy extract(uint32_t, float_t& weight) { - return weight; - } + static ValTy extract(uint32_t, float_t& weight) { return weight; } static bool reduce(uint32_t, float_t& weight, ValTy y) { // TODO merge function here // for now make sure the weights are close enough - //if (std::abs(weight - y) > 0.00001) { + // if (std::abs(weight - y) > 0.00001) { // galois::gInfo("weight ", node_id, " not consistent with one received"); //} weight += y; @@ -21,22 +19,19 @@ struct GradientSync { } //! reset weight to 0 - static void reset(uint32_t, float_t &weight) { - weight = 0; - } + static void reset(uint32_t, float_t& weight) { weight = 0; } //! save weight - static void setVal(uint32_t, float_t &weight, ValTy y) { - weight = y; - } + static void setVal(uint32_t, float_t& weight, ValTy y) { weight = y; } // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; } static bool extract_batch(unsigned, uint8_t*) { return false; } - static bool extract_reset_batch(unsigned, uint8_t*, size_t*, - DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } static bool extract_reset_batch(unsigned, uint8_t*) { return false; } static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { @@ -46,5 +41,5 @@ struct GradientSync { }; // TODO bitset; might have to do it manually -//GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); +// GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); #endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index e4874e468f..cb5a33e783 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -15,7 +15,7 @@ struct GraphConvSync { // copy the node's data to vector to serialize/send for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { vecToReturn[i] = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; } // move constructor should kick in here to avoid return copy return vecToReturn; @@ -27,14 +27,14 @@ struct GraphConvSync { assert(y.size() == deepgalois::_syncVectorSize); // loop and do addition for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += y[i]; + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += + y[i]; } 
return true; } //! do nothing (waste of a write) - static void reset(uint32_t, char&) { - } + static void reset(uint32_t, char&) {} //! element wise set static void setVal(uint32_t node_id, char&, ValTy y) { @@ -50,8 +50,9 @@ struct GraphConvSync { return false; } static bool extract_batch(unsigned, uint8_t*) { return false; } - static bool extract_reset_batch(unsigned, uint8_t*, size_t*, - DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } static bool extract_reset_batch(unsigned, uint8_t*) { return false; } static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 90c5781189..6e5e7a5926 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -7,16 +7,16 @@ namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); -void update_all_csrmm(size_t len, Graph& g, const float_t* in, - float_t* out, bool norm, float_t* norm_factor); -} +void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, float_t* norm_factor); +} // namespace deepgalois #else #include "deepgalois/gtypes.h" //#include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); -void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, - float_t* out, bool norm, const float_t* norm_factor); -} +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); +} // namespace deepgalois #endif diff --git a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h index c28d0ed89c..e4b59e694f 100644 --- a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h +++ b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h @@ -25,4 +25,4 @@ class elementwise_add_layer : public layer { in_grad = out_grad; } }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 56c0de0be6..09d4233c27 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -7,7 +7,7 @@ /** * GraphConv Layer; based on DGL implementation + follows TinyDNN layer - * convention + * convention * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html * * Parameters @@ -26,11 +26,11 @@ namespace deepgalois { class graph_conv_layer : public layer { public: - graph_conv_layer(unsigned level, bool act, bool norm, bool bias, - bool dropout, float_t dropout_rate, - std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), - dropout_(dropout), dropout_rate_(dropout_rate) { + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, + float_t dropout_rate, std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = 
true; name_ = layer_type() + "_" + std::to_string(level); @@ -39,16 +39,17 @@ class graph_conv_layer : public layer { } graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, + out_dims) {} ~graph_conv_layer() {} void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } virtual acc_t get_weight_decay_loss(); - //! Uses weights contained in this layer to update in_data (results from previous) - //! and save result to out_data + //! Uses weights contained in this layer to update in_data (results from + //! previous) and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); - //! Uses gradients from layer after this one to update both own weight gradients - //! as well as gradients for the features (in_grad) + //! Uses gradients from layer after this one to update both own weight + //! gradients as well as gradients for the features (in_grad) virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function @@ -56,11 +57,13 @@ class graph_conv_layer : public layer { virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); + virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out); void d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); #endif // user-defined combine function - virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); + virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out); private: bool act_; // whether to use activation function at the end @@ -72,12 +75,13 @@ class graph_conv_layer : public layer { float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* in_temp1; - float_t* trans_data; // y*x + float_t* trans_data; // y*x mask_t* dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, + unsigned seed = 1); inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix); }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h index 29e29f3474..c7167700a2 100644 --- a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -5,22 +5,24 @@ namespace deepgalois { // L2 Normalization Layer class l2_norm_layer : public layer { public: - l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims) - : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, + dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = false; - name_ = layer_type() + "_" + std::to_string(level); + name_ = layer_type() + "_" + std::to_string(level); } 
- l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) : - l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} + l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} ~l2_norm_layer() {} std::string layer_type() const override { return std::string("l2_norm"); } virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); + protected: float_t epsilon_; float_t scale_; }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c0f03aafd3..ec35c1d8c9 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -48,16 +48,15 @@ class layer : public deepgalois::node { layer(unsigned level, std::vector in_dims, std::vector out_dims) - : level_(level), begin_(0), - end_(0), num_dims(in_dims.size()), input_dims(in_dims), - output_dims(out_dims), labels(NULL) { } + : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), + input_dims(in_dims), output_dims(out_dims), labels(NULL) {} virtual ~layer() = default; virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; + << output_dims[0] << "," << output_dims[1] << "]\n"; } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } @@ -73,30 +72,35 @@ class layer : public deepgalois::node { // set methods void set_netphase(net_phase ctx) { phase_ = ctx; } void set_context(ContextType* ctx) { context = ctx; } - void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? - void set_labels_ptr(label_t *ptr) { labels = ptr; } - void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } - void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } + void set_trainable(bool trainable) { + trainable_ = trainable; + } // is this layer trainable? + void set_labels_ptr(label_t* ptr) { labels = ptr; } + void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } + void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata #ifdef CPU_ONLY - void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } + void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } #else - void set_graph_ptr(GraphGPU *ptr) { graph_gpu = ptr; } + void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } #endif - void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } + void update_dim_size(size_t g_size) { + input_dims[0] = output_dims[0] = g_size; + } //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_ = + std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. 
} virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; use_mask = false; if (masks != NULL) { use_mask = true; @@ -110,7 +114,8 @@ class layer : public deepgalois::node { void add_edge() { // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); + next_ = std::make_shared(this, output_dims[0], + output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); } @@ -140,10 +145,11 @@ class layer : public deepgalois::node { #ifdef CPU_ONLY // parallelize only when target size is big enough to mitigate thread // spawning overhead. - //bool parallel = (W.size() >= 512); + // bool parallel = (W.size() >= 512); opt->update(layer::weight_grad, layer::W); // W += grad #else - opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad + opt->update_gpu(input_dims[1] * output_dims[1], d_weight_grad, + d_W); // W += grad #endif // prev()->clear_grads(); next()->clear_grads(); @@ -174,9 +180,9 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; #ifdef CPU_ONLY - Graph *graph_cpu; + Graph* graph_cpu; #else - GraphGPU *graph_gpu; + GraphGPU* graph_gpu; #endif #ifdef GALOIS_USE_DIST @@ -186,9 +192,8 @@ class layer : public deepgalois::node { #endif }; - //! Connects tail to head's edge and sets that edge's target to tail -//inline void connect(layer* head, layer* tail) { +// inline void connect(layer* head, layer* tail) { inline void connect(layer* head, layer* tail) { tail->prev_ = head->next_; tail->prev_->add_next_node(tail); diff --git a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h index a8b6136eea..2f43e0a228 100644 --- a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h @@ -5,16 +5,18 @@ namespace deepgalois { // Leaky ReLU Layer class leaky_relu_layer : public layer { public: - leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, dims_t out_dims); - leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) : - leaky_relu_layer(level, 0.0, in_dims, out_dims) {} + leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims); + leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : leaky_relu_layer(level, 0.0, in_dims, out_dims) {} ~leaky_relu_layer() {} std::string layer_type() const override { return std::string("leaky_relu"); } virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); + protected: float_t epsilon_; size_t n; }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h index d68ae12479..ebcc774cc1 100644 --- a/libdeepgalois/include/deepgalois/layers/linear_layer.h +++ b/libdeepgalois/include/deepgalois/layers/linear_layer.h @@ -31,4 +31,4 @@ class linear_layer : public layer { protected: float_t scale_, bias_; }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index e8699d2498..11499bbede 100644 --- 
a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -26,15 +26,18 @@ typedef std::shared_ptr edgeptr_t; // edge class node : public std::enable_shared_from_this { public: - node() { prev_= NULL; next_ = NULL; } - //node(size_t in_size, size_t out_size) { + node() { + prev_ = NULL; + next_ = NULL; + } + // node(size_t in_size, size_t out_size) { //} //: prev_(in_size), next_(out_size) {} virtual ~node() {} const edgeptr_t prev() const { return prev_; } const edgeptr_t next() const { return next_; } protected: - //node() = delete; + // node() = delete; friend void connect(layer* head, layer* tail); mutable edgeptr_t prev_; mutable edgeptr_t next_; diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h index 601c5d67ed..4e1c47ed77 100644 --- a/libdeepgalois/include/deepgalois/layers/relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -6,11 +6,13 @@ namespace deepgalois { class relu_layer : public layer { public: relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) - : layer(level, in_dims, out_dims) { trainable_ = false; } + : layer(level, in_dims, out_dims) { + trainable_ = false; + } ~relu_layer() {} std::string layer_type() const override { return std::string("relu"); } virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index c8b1241acc..be133995c0 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -7,7 +7,9 @@ class sigmoid_loss_layer : public layer { sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~sigmoid_loss_layer(); - std::string layer_type() const override { return std::string("sigmoid_loss"); } + std::string layer_type() const override { + return std::string("sigmoid_loss"); + } void malloc_and_init(); inline label_t get_label(size_t i, size_t j); virtual void forward_propagation(const float_t* in_data, float_t* out_data); @@ -15,4 +17,4 @@ class sigmoid_loss_layer : public layer { float_t* out_grad, float_t* in_grad); virtual acc_t get_prediction_loss(); }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 43f07728cd..7ba096a2aa 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -7,7 +7,9 @@ class softmax_loss_layer : public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer(); - std::string layer_type() const override { return std::string("softmax_loss"); } + std::string layer_type() const override { + return std::string("softmax_loss"); + } void malloc_and_init(); inline label_t get_label(size_t i); virtual void forward_propagation(const float_t* in_data, float_t* out_data); @@ -15,4 +17,4 @@ class softmax_loss_layer : public layer { float_t* out_grad, float_t* in_grad); virtual acc_t get_prediction_loss(); }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/lgraph.h 
b/libdeepgalois/include/deepgalois/lgraph.h index d9e6e60d1d..53382199f4 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -13,7 +13,7 @@ namespace deepgalois { class LearningGraph { typedef std::vector IndexList; - //typedef index_t* IndexList; + // typedef index_t* IndexList; protected: bool is_device; index_t num_vertices_; @@ -21,23 +21,27 @@ class LearningGraph { IndexList rowptr_; IndexList colidx_; IndexList degrees_; - vdata_t *vertex_data_; - edata_t *edge_data_; + vdata_t* vertex_data_; + edata_t* edge_data_; - index_t *d_rowptr_; - index_t *d_colidx_; - index_t *d_degrees_; - vdata_t *d_vertex_data_; - edata_t *d_edge_data_; + index_t* d_rowptr_; + index_t* d_colidx_; + index_t* d_degrees_; + vdata_t* d_vertex_data_; + edata_t* d_edge_data_; std::vector> mirrorNodes; public: typedef size_t iterator; - LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), - vertex_data_(NULL), edge_data_(NULL) {} + LearningGraph(bool use_gpu) + : is_device(use_gpu), num_vertices_(0), num_edges_(0), vertex_data_(NULL), + edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } - void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } + void init(index_t nv, index_t ne) { + num_vertices_ = nv; + num_edges_ = ne; + } size_t size() { return (size_t)num_vertices_; } size_t sizeEdges() { return (size_t)num_edges_; } index_t get_degree(index_t vid) { return degrees_[vid]; } @@ -53,14 +57,13 @@ class LearningGraph { void constructNodes() {} void readGraph(std::string dataset); - void fixEndEdge(index_t vid, index_t row_end) { - rowptr_[vid+1] = row_end; - } + void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } void allocateFrom(index_t nv, index_t ne) { - //printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, num_edges_); + // printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, + // num_edges_); num_vertices_ = nv; - num_edges_ = ne; - rowptr_.resize(num_vertices_+1); + num_edges_ = ne; + rowptr_.resize(num_vertices_ + 1); colidx_.resize(num_edges_); degrees_.resize(num_vertices_); rowptr_[0] = 0; @@ -69,37 +72,42 @@ class LearningGraph { assert(dst < num_vertices_); assert(eid < num_edges_); colidx_[eid] = dst; - if (edge_data_) edge_data_[eid] = edata; + if (edge_data_) + edge_data_[eid] = edata; } void add_selfloop() { auto old_colidx_ = colidx_; colidx_.resize(num_vertices_ + num_edges_); for (index_t i = 0; i < num_vertices_; i++) { - auto start = rowptr_[i]; - auto end = rowptr_[i+1]; + auto start = rowptr_[i]; + auto end = rowptr_[i + 1]; bool selfloop_inserted = false; if (start == end) { - colidx_[start+i] = i; + colidx_[start + i] = i; continue; } for (auto e = start; e != end; e++) { auto dst = old_colidx_[e]; if (!selfloop_inserted) { if (i < dst) { - selfloop_inserted = true; - colidx_[e+i] = i; - colidx_[e+i+1] = dst; - } else if (e+1 == end) { - selfloop_inserted = true; - colidx_[e+i+1] = i; - colidx_[e+i] = dst; - } else colidx_[e+i] = dst; - } else colidx_[e+i+1] = dst; + selfloop_inserted = true; + colidx_[e + i] = i; + colidx_[e + i + 1] = dst; + } else if (e + 1 == end) { + selfloop_inserted = true; + colidx_[e + i + 1] = i; + colidx_[e + i] = dst; + } else + colidx_[e + i] = dst; + } else + colidx_[e + i + 1] = dst; } } - for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; + for (index_t i = 0; i <= num_vertices_; i++) + rowptr_[i] += i; num_edges_ += num_vertices_; - 
printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, num_edges_); + printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, + num_edges_); } bool isLocal(index_t vid); @@ -114,8 +122,8 @@ class LearningGraph { #ifdef CPU_ONLY index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } - index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - vdata_t getData(index_t vid) { return vertex_data_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } + vdata_t getData(index_t vid) { return vertex_data_[vid]; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t* row_start_ptr() { return &rowptr_[0]; } const index_t* row_start_ptr() const { return &rowptr_[0]; } @@ -125,26 +133,29 @@ class LearningGraph { edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } #else - CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } - CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } - CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src+1]; } - CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } - //CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } - //CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } - CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } - CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } - index_t *row_start_ptr() { return d_rowptr_; } - const index_t *row_start_ptr() const { return d_rowptr_; } - index_t *edge_dst_ptr() { return d_colidx_; } - const index_t *edge_dst_ptr() const { return d_colidx_; } + CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } + CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src + 1]; } + CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + // CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + // CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + index_t* row_start_ptr() { return d_rowptr_; } + const index_t* row_start_ptr() const { return d_rowptr_; } + index_t* edge_dst_ptr() { return d_colidx_; } + const index_t* edge_dst_ptr() const { return d_colidx_; } index_t* degrees_ptr() { return d_degrees_; } - edata_t *edge_data_ptr() { return d_edge_data_; } - vdata_t *vertex_data_ptr() { return d_vertex_data_; } - //const vdata_t *vertex_data_ptr() const { return vertex_data_; } - //const edata_t *edge_data_ptr() const { return edge_data; } + edata_t* edge_data_ptr() { return d_edge_data_; } + vdata_t* vertex_data_ptr() { return d_vertex_data_; } + // const vdata_t *vertex_data_ptr() const { return vertex_data_; } + // const edata_t *edge_data_ptr() const { return edge_data; } void print_test(); #endif - }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 6e7ac10fe2..89cc3d5d9c 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -28,13 +28,14 @@ void 
sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const float* A, const float* B, const float beta, float* C); // single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse -void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nonzero_idx, - const float* B, const float beta, float* C); +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nonzero_idx, const float* B, const float beta, float* C); // matrix-vector multiply -void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, - const float* A, const float* x, const float beta, float* y); +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y); //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); @@ -48,12 +49,13 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); float_t dot(size_t n, const float_t* x, const float_t* y); // SAXPY stands for โ€œSingle-precision A*X Plus Y" -void axpy(size_t n, const float_t a, float_t *x, float_t *y); +void axpy(size_t n, const float_t a, float_t* x, float_t* y); // Returns the index of the maximum value int argmax(const size_t n, const float_t* x); // the arguments of the maxima -//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 +//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) +//! / 2 float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector @@ -63,10 +65,13 @@ void clear_cpu(size_t n, float_t* in); void copy_cpu(size_t len, const float_t* in, float_t* out); // dropout functions randomly remove weights -void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* mask, float_t* out); -// dropout derivative: use existing dropouts in masks instead of generating them; -void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* mask, float_t* out); +// dropout derivative: use existing dropouts in masks instead of generating +// them; +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* mask, float_t* out); //! 
ReLU = keep if positive; and ReLU derivative: 1 if data > 0, 0 otherwise void relu_cpu(size_t n, const float_t* in, float_t* out); @@ -74,11 +79,13 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); // Leaky ReLU void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); -void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out); // Loss function for single-class label (one-hot) data: softmax void softmax(size_t n, const float_t* input, float_t* output); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); // Cross entropy float_t cross_entropy(size_t n, const float_t* y, const float_t* p); @@ -86,56 +93,65 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // Loss function for multi-class label (one-hot) data: sigmoid void sigmoid(size_t n, const float_t* input, float_t* output); -void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); // dropout functions randomly remove weights -void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); -void d_dropout(const float scale, const float_t* in, mask_t* mask, float_t* out); +void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, + float_t* out); +void d_dropout(const float scale, const float_t* in, mask_t* mask, + float_t* out); //! transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); - -} // math -} // deepgalois + +} // namespace math +} // namespace deepgalois // GPU operators -bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element -void init_const_gpu(int n, float_t value, float_t *array); +bool isnan_gpu(int n, + const float_t* array); // does array contain any 'nan' element +void init_const_gpu(int n, float_t value, float_t* array); void copy_gpu(int len, const float_t* in, float_t* out); -void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* out); // vector add -void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y); // axpy +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void axpy_gpu(const int n, const float_t a, const float_t* x, + float_t* y); // axpy void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative -void leaky_relu_gpu(const int n, const float_t epsilon, - const float_t* in, float_t* out); // Leaky ReLU -void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, - const float_t* data, float_t* out_diff); // Leaky ReLU derivative -void dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out); // dropout -void d_dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, const mask_t* masks, float_t* out); // dropout derivative +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out); // Leaky ReLU +void d_leaky_relu_gpu(const int n, const float_t epsilon, + 
const float_t* in_diff, const float_t* data, + float_t* out_diff); // Leaky ReLU derivative +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out); // dropout +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul_gpu(const size_t x, const size_t y, const size_t z, - const float_t* A, const float_t* B, float_t* C); + const float_t* A, const float_t* B, float_t* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply -void csrmm_gpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nonzero_idx, - const float* B, const float beta, float* trans_C, float* C); -void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out_data); +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nonzero_idx, const float* B, const float beta, + float* trans_C, float* C); +void softmax_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* out_data); void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); -void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out_data); +void sigmoid_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* out_data); void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); @@ -146,9 +162,11 @@ bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); -void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t l2_norm_gpu(int n, const float_t *in); -void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out); -void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t *in_diff, float_t *out_diff); +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss); +acc_t l2_norm_gpu(int n, const float_t* in); +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index aa62339a2a..117de131b2 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -29,31 +29,29 @@ namespace deepgalois { class Net { public: 
Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv) : - is_single_class(single), has_l2norm(l2norm), has_dense(dense), - neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), - num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - learning_rate(lr), dropout_rate(dropout), weight_decay(wd), - val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv) + : is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { assert(n_conv > 0); // TODO use galois print std::cout << "Configuration: num_threads " << num_threads - << ", num_conv_layers " << num_conv_layers - << ", num_epochs " << num_epochs - << ", hidden1 " << hidden1 - << ", learning_rate " << learning_rate - << ", dropout_rate " << dropout_rate + << ", num_conv_layers " << num_conv_layers << ", num_epochs " + << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " + << learning_rate << ", dropout_rate " << dropout_rate << ", weight_decay " << weight_decay << "\n"; num_layers = num_conv_layers + 1; - if (has_l2norm) num_layers ++; - if (has_dense) num_layers ++; + if (has_l2norm) + num_layers++; + if (has_dense) + num_layers++; // initialize feature metadata feature_dims.resize(num_layers + 1); - - #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_dataset(dataset_str); @@ -62,56 +60,60 @@ class Net { // read graph, get num nodes num_classes = context->read_labels(); - //std::cout << "Reading label masks ... "; + // std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks+num_samples, 0); - std::fill(val_masks, val_masks+num_samples, 0); + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks + num_samples, 0); + std::fill(val_masks, val_masks + num_samples, 0); // get training and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; + train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; // TODO do all can be used below - for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; + for (size_t i = train_begin; i < train_end; i++) + train_masks[i] = 1; + for (size_t i = val_begin; i < val_end; i++) + val_masks[i] = 1; } else { - train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); - val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); + train_count = context->read_masks("train", num_samples, train_begin, + train_end, train_masks); + val_count = context->read_masks("val", num_samples, val_begin, val_end, + val_masks); } if (subgraph_sample_size > train_count) { - std::cout << "FATAL: subgraph size can not be larger than the size of training set\n"; + std::cout << "FATAL: subgraph size can not be larger than the size of " + "training set\n"; exit(1); } feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers-1] = num_classes; // MLP embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers + 1] = + num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); context->set_use_subgraph(subgraph_sample_size > 0); init(); -#endif +#endif } - Net() : is_single_class(true), has_l2norm(false), has_dense(false), - neighbor_sample_size(0), subgraph_sample_size(0), - num_threads(1), num_samples(0), num_classes(0), - num_conv_layers(0), num_layers(0), num_epochs(0), - learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - train_begin(0), train_end(0), train_count(0), - val_begin(0), val_end(0), val_count(0), - test_begin(0), test_end(0), test_count(0), - val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - train_masks(NULL), val_masks(NULL), - test_masks(NULL), context(NULL) {} + Net() + : is_single_class(true), has_l2norm(false), has_dense(false), + neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), + num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), + num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), + train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), + val_count(0), test_begin(0), test_end(0), test_count(0), + val_interval(1), num_subgraphs(1), num_vertices_sg(9000), + 
train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} void init(); #ifdef GALOIS_USE_DIST @@ -121,27 +123,28 @@ class Net { size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } - void normalize(); // Scale gradient to counterbalance accumulation + void normalize(); // Scale gradient to counterbalance accumulation void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - std::string header = ""; + std::string header = ""; std::string seperator = " "; #ifdef GALOIS_USE_DIST unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - header = "[" + std::to_string(myID) + "] "; - seperator = "\n"; + header = "[" + std::to_string(myID) + "] "; + seperator = "\n"; #endif double total_train_time = 0.0; - int num_subg_remain = 0; + int num_subg_remain = 0; #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST if (subgraph_sample_size) { context->createSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[num_samples*num_subgraphs]; + subgraphs_masks = new mask_t[num_samples * num_subgraphs]; std::cout << "\nConstruct training vertex set induced graph...\n"; - sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); + sampler->set_masked_graph(train_begin, train_end, train_count, + train_masks, context->getGraphPointer()); } #endif #endif @@ -160,29 +163,34 @@ class Net { #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST for (int sid = 0; sid < num_subgraphs; sid++) { - //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + // galois::do_all(galois::iterate(size_t(0), + // size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; - //tid = galois::substrate::ThreadPool::getTID(); - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); - }//, galois::loopname("subgraph_gen")); + // tid = galois::substrate::ThreadPool::getTID(); + sampler->subgraph_sample(subgraph_sample_size, + *(context->getSubgraphPointer(sid)), + &subgraphs_masks[sid * num_samples], tid); + } //, galois::loopname("subgraph_gen")); #endif #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); - //std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; } #ifndef GALOIS_USE_DIST for (int i = 0; i < num_subgraphs; i++) { auto sg_ptr = context->getSubgraphPointer(i); sg_ptr->degree_counting(); - //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); + // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " + // num_e ", sg_ptr->sizeEdges(), "\n"); } -#endif //GALOIS_USE_DIST +#endif // GALOIS_USE_DIST num_subg_remain--; - int sg_id = num_subg_remain; + int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); - num_vertices_sg = subgraph_ptr->size(); - //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + num_vertices_sg = subgraph_ptr->size(); + // galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", + // num_edges: ", subgraph_ptr->sizeEdges(), "\n"); for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_vertices_sg); context->norm_factor_computing(1, sg_id); @@ -191,12 +199,15 @@ class Net { layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); } // update labels for subgraph - 
context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); + context->gen_subgraph_labels(num_vertices_sg, + &subgraphs_masks[sg_id * num_samples]); + layers[num_layers - 1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph - context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data + context->gen_subgraph_feats(num_vertices_sg, + &subgraphs_masks[sg_id * num_samples]); + layers[0]->set_feats_ptr( + context->get_feats_subg_ptr()); // feed input data } // training steps @@ -204,12 +215,13 @@ class Net { set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; - // forward: after this phase, layer edges will contain intermediate features - // for use during backprop + // forward: after this phase, layer edges will contain intermediate + // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); // backward: use intermediate features + ground truth to update layers - // with feature gradients whcih are then used to calculate weight gradients + // with feature gradients whcih are then used to calculate weight + // gradients Net::bprop(); // gradient update: use gradients stored on each layer to update model for @@ -218,8 +230,8 @@ class Net { // validation / testing set_netphases(net_phase::test); - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss - << " train_acc " << train_acc << seperator; + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << seperator; t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; @@ -227,17 +239,19 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << val_loss - << " val_acc " << val_acc << seperator; - std::cout << header << "time " << std::setprecision(3) << std::fixed << epoch_time + val_time - << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << seperator; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; } else { - std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time + << ")\n"; } } double avg_train_time = total_train_time / (double)num_epochs; - double throughput = 1000.0 * (double)num_epochs / total_train_time; + double throughput = 1000.0 * (double)num_epochs / total_train_time; std::cout << "\nAverage training time: " << avg_train_time << " ms. 
Throughput: " << throughput << " epoch/s\n"; } @@ -251,35 +265,37 @@ class Net { mask_t* masks = NULL; if (type == "train") { begin = train_begin; - end = train_end; + end = train_end; count = train_count; masks = train_masks; if (subgraph_sample_size) { // update masks for subgraph masks = NULL; begin = 0; - end = num_vertices_sg; + end = num_vertices_sg; count = num_vertices_sg; } } else if (type == "val") { begin = val_begin; - end = val_end; + end = val_end; count = val_count; masks = val_masks; } else { begin = test_begin; - end = test_end; + end = test_end; count = test_count; masks = test_masks; } #ifdef CPU_ONLY - if (subgraph_sample_size && type != "train") { // switch to the original graph - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); + if (subgraph_sample_size && + type != "train") { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(num_samples); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(context->getGraphPointer()); layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); } - layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[num_layers - 1]->set_labels_ptr(context->get_labels_ptr()); layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data } #else @@ -291,7 +307,7 @@ class Net { masks = d_test_masks; } #endif - loss = fprop(begin, end, count, masks); + loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); label_t* labels; if (type == "train" && subgraph_sample_size) { @@ -302,7 +318,8 @@ class Net { if (is_single_class) { acc = masked_accuracy(begin, end, count, masks, predictions, labels); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); + acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, + labels); } t_eval.Stop(); return t_eval.Millisecs(); @@ -316,9 +333,10 @@ class Net { test_count = 55703; test_end = test_begin + test_count; #ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; + for (size_t i = test_begin; i < test_end; i++) + test_masks[i] = 1; #else - for (size_t i = test_begin; i < test_end; i++) { + for (size_t i = test_begin; i < test_end; i++) { if (dGraph->isLocal(i)) { test_masks[dGraph->getLID(i)] = 1; } @@ -326,9 +344,11 @@ class Net { #endif } else { #ifndef GALOIS_USE_DIST - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks); + test_count = context->read_masks("test", num_samples, test_begin, + test_end, test_masks); #else - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); + test_count = context->read_masks("test", num_samples, test_begin, + test_end, test_masks, dGraph); #endif } #ifndef CPU_ONLY @@ -340,14 +360,14 @@ class Net { void construct_layers() { // append conv layers std::cout << "\nConstructing layers...\n"; - for (size_t i = 0; i < num_conv_layers-1; i++) - append_conv_layer(i, true); // conv layers, act=true - append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + for (size_t i = 0; i < num_conv_layers - 1; i++) + append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false if (has_l2norm) - append_l2norm_layer(num_conv_layers); // l2_norm layer + append_l2norm_layer(num_conv_layers); // l2_norm layer if (has_dense) - append_dense_layer(num_layers-2); // 
dense layer - append_out_layer(num_layers-1); // output layer + append_dense_layer(num_layers - 2); // dense layer + append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { @@ -380,11 +400,11 @@ class Net { void append_dense_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); } //! Add an output layer to the network @@ -402,7 +422,8 @@ class Net { } //! Add a convolution layer to the network - void append_conv_layer(size_t layer_id, bool act=false, bool norm=true, bool bias=false, bool dropout=true) { + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true) { assert(dropout_rate < 1.0); assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); @@ -410,7 +431,7 @@ class Net { in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); + dropout_rate, in_dims, out_dims); layers[layer_id]->set_graph_ptr(context->getGraphPointer()); } @@ -467,20 +488,20 @@ class Net { } protected: - bool is_single_class; // single-class (one-hot) or multi-class label - bool has_l2norm; // whether the net contains an l2_norm layer - bool has_dense; // whether the net contains an dense layer - unsigned neighbor_sample_size; // neighbor sampling - unsigned subgraph_sample_size; // subgraph sampling - int num_threads; // number of threads - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_conv_layers; // number of convolutional layers - size_t num_layers; // total number of layers (conv + output) - int num_epochs; // number of epochs - float learning_rate; // learning rate - float dropout_rate; // dropout rate - float weight_decay; // weighti decay for over-fitting + bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads + size_t num_samples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) + int num_epochs; // number of epochs + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting size_t train_begin, train_end, train_count; size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; @@ -489,15 +510,15 @@ class Net { int num_vertices_sg; bool is_selfloop; - mask_t* train_masks; // masks for training - mask_t* d_train_masks; // masks for training on device - mask_t* val_masks; // masks for validation - mask_t* d_val_masks; // masks for validation on device - mask_t* test_masks; // 
masks for test - mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs - std::vector feature_dims; // feature dimnesions for each layer - std::vector layers; // all the layers in the neural network + mask_t* train_masks; // masks for training + mask_t* d_train_masks; // masks for training on device + mask_t* val_masks; // masks for validation + mask_t* d_val_masks; // masks for validation on device + mask_t* test_masks; // masks for test + mask_t* d_test_masks; // masks for test on device + mask_t* subgraphs_masks; // masks for subgraphs + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else @@ -507,13 +528,15 @@ class Net { #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST - Sampler *sampler; + Sampler* sampler; #endif #endif // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, + float_t* preds, label_t* ground_truth); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth); }; } // namespace deepgalois - diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index 4fd7caa800..aa0dcbaab7 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -27,8 +27,8 @@ struct optimizer { optimizer(const optimizer&) = default; optimizer(optimizer&&) = default; optimizer& operator=(const optimizer&) = default; - optimizer& operator=(optimizer&&) = default; - virtual ~optimizer() = default; + optimizer& operator=(optimizer&&) = default; + virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; #ifndef CPU_ONLY virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; @@ -40,8 +40,10 @@ struct optimizer { template struct stateful_optimizer : public optimizer { void reset() override { - for (auto& e : E_) e.clear(); + for (auto& e : E_) + e.clear(); } + protected: template vec_t& get(const vec_t& key) { @@ -53,7 +55,7 @@ struct stateful_optimizer : public optimizer { std::unordered_map E_[N]; #ifndef CPU_ONLY template - float_t *get_gpu(const size_t n, const float_t *key); + float_t* get_gpu(const size_t n, const float_t* key); std::unordered_map dE_[N]; #endif }; diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index e25124cbfd..9e5faf1f39 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -7,14 +7,16 @@ class Reader { private: std::string dataset_str; void progressPrint(unsigned maxii, unsigned ii); + public: Reader() : dataset_str("") {} Reader(std::string dataset) : dataset_str(dataset) {} void init(std::string dataset) { dataset_str = dataset; } size_t read_labels(bool is_single_class, label_t*& labels); size_t read_features(float_t*& feats, std::string filetype = "bin"); - size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); - void readGraphFromGRFile(Graph *g); + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + 
mask_t* masks); + void readGraphFromGRFile(Graph* g); }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index c5f8abd219..c559804354 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -4,7 +4,7 @@ #include "deepgalois/gtypes.h" namespace deepgalois { -#define ETA 1.5 // length factor of DB in sampling +#define ETA 1.5 // length factor of DB in sampling #define SAMPLE_CLIP 3000 // clip degree in sampling #define DEFAULT_SIZE_FRONTIER 3000 #define DEFAULT_SIZE_SUBG 9000 @@ -16,19 +16,25 @@ class Sampler { ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sample(size_t n, Graph &sg, mask_t* masks, unsigned tid = 0); + void subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid = 0); // !API function for user-defined selection strategy - virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); - virtual void select_vertices(size_t n, int m, VertexSet &vertex_set, unsigned tid); + virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set); + virtual void select_vertices(size_t n, int m, VertexSet& vertex_set, + unsigned tid); - //galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); + // galois::runtime::iterable > + // neighbor_sampler(Graph &g, VertexID v); - edge_iterator sampled_edge_begin(Graph &g, VertexID v) { return g.edge_begin(v); } + edge_iterator sampled_edge_begin(Graph& g, VertexID v) { + return g.edge_begin(v); + } - edge_iterator sampled_edge_end(Graph &g, VertexID v) { return g.edge_end(v); } + edge_iterator sampled_edge_end(Graph& g, VertexID v) { return g.edge_end(v); } - void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, Graph* g); + void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, + Graph* g); protected: int m_; @@ -39,19 +45,22 @@ class Sampler { int subg_deg; VertexList vertices_; std::vector node_train; - mask_t *masks_; - Graph *masked_graph; - Graph *graph; + mask_t* masks_; + Graph* masked_graph; + Graph* graph; - // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g - void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); + // Given a subset of vertices and a graph g, generate a subgraph sg from the + // graph g + void generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub); void generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& mg); - void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector °rees); + void get_masked_degrees(size_t n, mask_t* masks, Graph* g, + std::vector& degrees); void update_masks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); - void check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size); + void check_DB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, size_t size); }; -} +} // namespace deepgalois #endif diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 87e7411689..71add8b650 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -19,9 +19,9 @@ typedef std::vector typedef std::vector FV; // feature vector typedef std::vector FV2D; // feature vectors: num_samples x feature_dim typedef float acc_t; // Accuracy type -typedef 
uint8_t label_t; // label is for classification (supervised learning) -typedef uint8_t mask_t; // mask is used to indicate different uses of labels: - // train, val, test +typedef uint8_t label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: + // train, val, test typedef uint32_t VertexID; typedef uint64_t EdgeID; typedef std::vector VertexList; @@ -44,15 +44,15 @@ enum class net_phase { train, test }; #ifdef GALOIS_USE_DIST namespace deepgalois { - // TODO only being used by graph conv layer at the moment so extern works, - // but this design is bad and needs to be revisited - - //! Set this to let sync struct know where to get data from - extern float_t* _dataToSync; - //! Set this to let sync struct know the size of the vector to use during - //! sync - extern long unsigned _syncVectorSize; -} +// TODO only being used by graph conv layer at the moment so extern works, +// but this design is bad and needs to be revisited + +//! Set this to let sync struct know where to get data from +extern float_t* _dataToSync; +//! Set this to let sync struct know the size of the vector to use during +//! sync +extern long unsigned _syncVectorSize; +} // namespace deepgalois #endif #endif diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index c8bb1d4e41..08f28126bf 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -98,10 +98,10 @@ uniform_rand(T min, T max) { // sequential prefix sum template -inline std::vector prefix_sum(const std::vector &in) { +inline std::vector prefix_sum(const std::vector& in) { std::vector prefix(in.size() + 1); OutTy total = 0; - for (size_t i = 0; i < in.size(); i ++) { + for (size_t i = 0; i < in.size(); i++) { prefix[i] = total; total += (OutTy)in[i]; } @@ -110,62 +110,66 @@ inline std::vector prefix_sum(const std::vector &in) { } template -OutTy* parallel_prefix_sum(const std::vector &in); +OutTy* parallel_prefix_sum(const std::vector& in); // Utility function to randomly select k items from [begin, end) template inline T* select_k_items(T k, T begin, T end) { - auto i = begin; - - // reservoir[] is the output array. Initialize - // it with first k vertices - T *reservoir = new T[k]; - for (; i < k; i++) reservoir[i] = i; - - // Use a different seed value so that we don't get - // same result each time we run this program - srand(time(NULL)); - - // Iterate from the (k+1)th element to nth element - for (; i < end; i++) { - // Pick a random index from 0 to i. - auto j = rand() % (i + 1); - - // If the randomly picked index is smaller than k, - // then replace the element present at the index - // with new element from stream - if (j < k) reservoir[j] = i; - } - return reservoir; + auto i = begin; + + // reservoir[] is the output array. Initialize + // it with first k vertices + T* reservoir = new T[k]; + for (; i < k; i++) + reservoir[i] = i; + + // Use a different seed value so that we don't get + // same result each time we run this program + srand(time(NULL)); + + // Iterate from the (k+1)th element to nth element + for (; i < end; i++) { + // Pick a random index from 0 to i. 
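// Reservoir-sampling invariant (explanatory note): after item i has been
// processed, each item seen so far remains in reservoir[] with probability
// k/(i+1), so on return the reservoir holds a uniform random sample of k
// items from the range. Note the initial fill starts at `begin`, so the
// routine appears to assume begin == 0; reservoir[0..begin) would otherwise
// be left uninitialized.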
+ auto j = rand() % (i + 1); + + // If the randomly picked index is smaller than k, + // then replace the element present at the index + // with new element from stream + if (j < k) + reservoir[j] = i; + } + return reservoir; } // Utility function to find ceiling of r in arr[l..h] template -inline T find_ceil(T *arr, T r, T l, T h) { - T mid; - while (l < h) { - mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 - (r > arr[mid]) ? (l = mid + 1) : (h = mid); - } - return (arr[l] >= r) ? l : -1; -} - -// Utility function to select one element from n elements given a frequency (probability) distribution +inline T find_ceil(T* arr, T r, T l, T h) { + T mid; + while (l < h) { + mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 + (r > arr[mid]) ? (l = mid + 1) : (h = mid); + } + return (arr[l] >= r) ? l : -1; +} + +// Utility function to select one element from n elements given a frequency +// (probability) distribution // https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ template -T select_one_item(T n, T *dist) { - T *offsets = new T[n]; - offsets[0] = dist[0]; - // compute the prefix sum of the distribution - for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; - // offsets[n-1] is sum of all frequencies - T sum = offsets[n-1]; - T r = (rand() % sum) + 1; - // find which range r falls into, and return the index of the range - return find_ceil(offsets, r, 0, n - 1); +T select_one_item(T n, T* dist) { + T* offsets = new T[n]; + offsets[0] = dist[0]; + // compute the prefix sum of the distribution + for (T i = 1; i < n; ++i) + offsets[i] = offsets[i - 1] + dist[i]; + // offsets[n-1] is sum of all frequencies + T sum = offsets[n - 1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range + return find_ceil(offsets, r, 0, n - 1); } -acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, - size_t num_classes, label_t *ground_truth, float_t *pred); +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t* masks, + size_t num_classes, label_t* ground_truth, float_t* pred); -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 66a1a0885e..1da6c6c5a1 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -26,10 +26,11 @@ size_t DistContext::read_labels(std::string dataset_str) { in >> m >> num_classes >> std::ws; assert(m == dGraph->globalSize()); // size of labels should be # local nodes - h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for each vertex: N x 1 + h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for + // each vertex: N x 1 uint32_t foundVertices = 0; - unsigned v = 0; + unsigned v = 0; // each line contains a set of 0s and 1s while (std::getline(in, line)) { // only bother if local node @@ -55,8 +56,9 @@ size_t DistContext::read_labels(std::string dataset_str) { in.close(); // print the number of vertex classes - galois::gPrint("[", myID, "] Done with labels, unique label counts: ", - num_classes, "; set ", foundVertices, " nodes\n"); + galois::gPrint("[", myID, + "] Done with labels, unique label counts: ", num_classes, + "; set ", foundVertices, " nodes\n"); return num_classes; } @@ -97,8 +99,8 @@ size_t DistContext::read_features(std::string dataset_str) { } in.close(); - galois::gPrint("[", myID, "] Done with features, feature length: ", - feat_len, "\n"); + galois::gPrint("[", myID, 
"] Done with features, feature length: ", feat_len, + "\n"); return feat_len; } @@ -141,50 +143,42 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, i++; } std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "(" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; + << ") Number of valid samples: " << sample_count << "(" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; in.close(); return sample_count; } -float_t* DistContext::get_in_ptr() { - return &h_feats[0]; -} +float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -//void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { +// void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { void DistContext::norm_factor_computing(bool, int) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in norm_factors = new float_t[localVertices]; - galois::do_all(galois::iterate((size_t)0, localVertices), - [&](auto v) { - norm_factors[v] = 1; - }, galois::loopname("NormCounting")); + galois::do_all( + galois::iterate((size_t)0, localVertices), + [&](auto v) { norm_factors[v] = 1; }, galois::loopname("NormCounting")); - //galois::do_all(galois::iterate((size_t)0, localVertices), + // galois::do_all(galois::iterate((size_t)0, localVertices), // [&](auto v) { - // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); - // float_t temp = std::sqrt(float_t(degree)); - // if (temp == 0.0) norm_factors[v] = 0.0; - // else norm_factors[v] = 1.0 / temp; + // auto degree = std::distance(graph_cpu->edge_begin(v), + // graph_cpu->edge_end(v)); float_t temp = std::sqrt(float_t(degree)); if + // (temp == 0.0) norm_factors[v] = 0.0; else norm_factors[v] = 1.0 / temp; // }, galois::loopname("NormCounting")); return; } void DistContext::initializeSyncSubstrate() { - DistContext::syncSubstrate = - new galois::graphs::GluonSubstrate( - *DistContext::graph_cpu, - galois::runtime::getSystemNetworkInterface().ID, - galois::runtime::getSystemNetworkInterface().Num, - false - ); + DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( + *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); } galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; -} // deepgalois +} // namespace deepgalois diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 757279ceba..f07da83d6d 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -11,27 +11,30 @@ namespace deepgalois { Context::Context() : Context(false) {} Context::~Context() { - if (h_labels) delete[] h_labels; - if (h_feats) delete[] h_feats; - if (norm_factors) delete[] norm_factors; - //if (h_feats_subg) delete[] h_feats_subg; - //if (h_labels_subg) delete[] h_labels_subg; - //if (norm_factors_subg) delete[] norm_factors_subg; + if (h_labels) + delete[] h_labels; + if (h_feats) + delete[] h_feats; + if (norm_factors) + delete[] norm_factors; + // if (h_feats_subg) delete[] h_feats_subg; + // if (h_labels_subg) delete[] h_labels_subg; + // if (norm_factors_subg) delete[] norm_factors_subg; } void Context::createSubgraphs(int num_subgraphs) { subgraphs_cpu.resize(num_subgraphs); for (int i = 0; i < num_subgraphs; i++) - subgraphs_cpu[i] = new Graph(); + subgraphs_cpu[i] = new Graph(); } // generate labels for the subgraph, m is 
subgraph size -void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { - //if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; +void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { + // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; if (is_single_class) { h_labels_subg.resize(m); } else { - h_labels_subg.resize(m*num_classes); + h_labels_subg.resize(m * num_classes); } size_t count = 0; for (size_t i = 0; i < n; i++) { @@ -39,23 +42,25 @@ void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { if (is_single_class) { h_labels_subg[count] = h_labels[i]; } else { - std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, &h_labels_subg[count*num_classes]); - } - count ++; - } + std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + &h_labels_subg[count * num_classes]); + } + count++; + } } } // generate input features for the subgraph, m is subgraph size -void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { +void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { size_t count = 0; - //if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - h_feats_subg.resize(m*feat_len); + // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, &h_feats_subg[count*feat_len]); - count ++; - } + std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, + &h_feats_subg[count * feat_len]); + count++; + } } } @@ -71,32 +76,33 @@ size_t Context::read_graph(bool selfloop) { } else if (filetype == "bin") { graph_cpu->readGraph(dataset); } else if (filetype == "gr") { - graph_cpu = new Graph(); + graph_cpu = new Graph(); std::string filename = path + dataset + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { Graph graph_temp; - //galois::graphs::readGraph(graph_temp, filename); + // galois::graphs::readGraph(graph_temp, filename); graph_temp.readGraph(dataset); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; - //} else galois::graphs::readGraph(*graph_cpu, filename); - } else graph_cpu->readGraph(dataset); -// TODO dist version of self loop + //} else galois::graphs::readGraph(*graph_cpu, filename); + } else + graph_cpu->readGraph(dataset); + // TODO dist version of self loop } else { printf("Unkown file format\n"); exit(1); } Tread.stop(); auto g = getGraphPointer(); - std::cout << "num_vertices " << g->size() << " num_edges " - << g->sizeEdges() << "\n"; + std::cout << "num_vertices " << g->size() << " num_edges " << g->sizeEdges() + << "\n"; n = g->size(); return n; } -void Context::add_selfloop(Graph &og, Graph &g) { - g.allocateFrom(og.size(), og.size()+og.sizeEdges()); +void Context::add_selfloop(Graph& og, Graph& g) { + g.allocateFrom(og.size(), og.size() + og.sizeEdges()); g.constructNodes(); /* for (size_t src = 0; src < og.size(); src++) { @@ -139,19 +145,19 @@ void Context::alloc_norm_factor() { void Context::alloc_subgraph_norm_factor(int subg_id) { Graph* g = getSubgraphPointer(subg_id); - //if (norm_factors_subg == NULL) + // if (norm_factors_subg == NULL) #ifdef USE_MKL - //norm_factors_subg = new float_t[g->sizeEdges()]; - norm_factors_subg.resize(g->sizeEdges()); + // norm_factors_subg = new float_t[g->sizeEdges()]; + norm_factors_subg.resize(g->sizeEdges()); #else - norm_factors_subg.resize(g->size()); - //norm_factors_subg = new float_t[g->size()]; + 
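// Explanatory note on the two branches: without USE_MKL, aggregation applies
// one factor per vertex (1/sqrt(degree)), so a vertex-sized buffer suffices;
// the USE_MKL branch above sizes the buffer per edge because the csrmm path
// consumes a factor 1/(sqrt(deg(u))*sqrt(deg(v))) for every edge (u,v), as
// computed in norm_factor_computing() below.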
norm_factors_subg.resize(g->size()); + // norm_factors_subg = new float_t[g->size()]; #endif } void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; - float_t *constants; + float_t* constants; if (!is_subgraph) { g = getGraphPointer(); alloc_norm_factor(); @@ -164,26 +170,37 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { auto g_size = g->size(); g->degree_counting(); #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { - float_t c_i = std::sqrt(float_t(g->get_degree(i))); - for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { - const auto j = g->getEdgeDst(e); - float_t c_j = std::sqrt(float_t(g->get_degree(j))); - if (c_i == 0.0 || c_j == 0.0) constants[e] = 0.0; - else constants[e] = 1.0 / (c_i * c_j); - } - }, galois::loopname("NormCountingEdge")); + galois::do_all( + galois::iterate((size_t)0, g_size), + [&](auto i) { + float_t c_i = std::sqrt(float_t(g->get_degree(i))); + for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { + const auto j = g->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(g->get_degree(j))); + if (c_i == 0.0 || c_j == 0.0) + constants[e] = 0.0; + else + constants[e] = 1.0 / (c_i * c_j); + } + }, + galois::loopname("NormCountingEdge")); #else - galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { - auto degree = g->get_degree(v); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) constants[v] = 0.0; - else constants[v] = 1.0 / temp; - }, galois::loopname("NormCountingVertex")); + galois::do_all( + galois::iterate((size_t)0, g_size), + [&](auto v) { + auto degree = g->get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) + constants[v] = 0.0; + else + constants[v] = 1.0 / temp; + }, + galois::loopname("NormCountingVertex")); #endif } -void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { +void Context::read_edgelist(const char* filename, bool symmetrize, + bool add_self_loop) { std::ifstream in; std::string line; in.open(filename, std::ios::in); @@ -192,10 +209,11 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self size_t num_vertices_ = m; size_t num_edges_ = 0; std::cout << "num_vertices " << num_vertices_ << "\n"; - std::vector > vertices(m); + std::vector> vertices(m); for (size_t i = 0; i < n; i++) { std::set neighbors; - if (add_self_loop) neighbors.insert(i); + if (add_self_loop) + neighbors.insert(i); vertices.push_back(neighbors); } while (std::getline(in, line)) { @@ -204,10 +222,12 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self edge_stream >> u; edge_stream >> v; vertices[u].insert(v); - if (symmetrize) vertices[v].insert(u); + if (symmetrize) + vertices[v].insert(u); } in.close(); - for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); + for (size_t i = 0; i < n; i++) + num_edges_ += vertices[i].size(); std::cout << "num_edges " << num_edges_ << "\n"; std::vector degrees; @@ -224,13 +244,13 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self offsets[degrees.size()] = total; degrees.clear(); assert(num_edges_ == offsets[num_vertices_]); - EdgeID *colidx_ = new EdgeID[num_edges_]; - VertexID *rowptr_ = new VertexID[num_vertices_ + 1]; + EdgeID* colidx_ = new EdgeID[num_edges_]; + VertexID* rowptr_ = new VertexID[num_vertices_ + 1]; for (size_t i = 0; i < num_vertices_ + 1; i++) rowptr_[i] = offsets[i]; for (size_t i = 0; i < num_vertices_; i++) { for (auto dst : 
vertices[i]) - colidx_[offsets[i]++] = dst; + colidx_[offsets[i]++] = dst; } auto g = getGraphPointer(); @@ -238,7 +258,7 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self g->constructNodes(); for (size_t i = 0; i < num_vertices_; i++) { auto row_begin = rowptr_[i]; - auto row_end = rowptr_[i+1]; + auto row_end = rowptr_[i + 1]; g->fixEndEdge(i, row_end); for (auto offset = row_begin; offset < row_end; offset++) g->constructEdge(offset, colidx_[offset], 0); @@ -254,4 +274,4 @@ inline void init_features(size_t dim, vec_t &x) { } */ -} // end deepgalois namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index f7a76d2db0..365bef8e50 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -27,31 +27,37 @@ int64_t cluster_seedgen(void) { namespace deepgalois { // computing normalization factor for each vertex -__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, GraphGPU graph, + float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_fac[i] = 0.0; - else norm_fac[i] = 1.0 / temp; + if (temp == 0.0) + norm_fac[i] = 0.0; + else + norm_fac[i] = 1.0 / temp; } } // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, + float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != 0.0); // should never be zero since self-loop added for each vertex - d_src = 1.0 / sqrt(d_src); - auto start = graph.edge_begin(src); + assert(d_src != + 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + auto start = graph.edge_begin(src); index_t end = graph.edge_end(src); for (index_t e = start; e != end; e++) { index_t dst = graph.getEdgeDst(e); - //if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); + // if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, + // dst, e, start, end); assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); assert(d_dst != 0.0); - d_dst = 1.0 / sqrt(d_dst); + d_dst = 1.0 / sqrt(d_dst); norm_fac[e] = d_src * d_dst; } } @@ -66,10 +72,14 @@ Context::Context() : Context(true) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); - CUSPARSE_CHECK(cusparseSetMatType(cusparse_matdescr_,CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CHECK(cusparseSetMatIndexBase(cusparse_matdescr_,CUSPARSE_INDEX_BASE_ZERO)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CUSPARSE_CHECK( + cusparseSetMatType(cusparse_matdescr_, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CHECK( + cusparseSetMatIndexBase(cusparse_matdescr_, CUSPARSE_INDEX_BASE_ZERO)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { @@ -81,34 +91,36 @@ Context::~Context() { 
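// Explanatory note: the GPU Context destructor below releases the CUDA
// library state created in the constructor above (cuSPARSE matrix descriptor,
// cuRAND generator) and frees the device buffers d_labels, d_feats and
// norm_factors; the null checks presumably guard against freeing buffers that
// were never allocated because copy_data_to_device() was not called.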
CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - if (d_labels) CUDA_CHECK(cudaFree(d_labels)); - if (d_feats) CUDA_CHECK(cudaFree(d_feats)); - if (norm_factors) CUDA_CHECK(cudaFree(norm_factors)); + if (d_labels) + CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) + CUDA_CHECK(cudaFree(d_feats)); + if (norm_factors) + CUDA_CHECK(cudaFree(norm_factors)); } -void Context::createSubgraphs(int n_sg) { -} +void Context::createSubgraphs(int n_sg) {} -void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { -} +void Context::gen_subgraph_labels(size_t m, const mask_t* masks) {} -void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { -} +void Context::gen_subgraph_feats(size_t m, const mask_t* masks) {} void Context::norm_factor_computing(bool is_subgraph, int subg_id) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; if (!is_selfloop_added) { - std::cout << "Set -sl=1 to add selfloop\n"; + std::cout << "Set -sl=1 to add selfloop\n"; exit(0); } #ifdef USE_CUSPARSE int nnz = graph_gpu.sizeEdges(); CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); init_const_gpu(nnz, 0.0, norm_factors); - norm_factor_computing_edge<<>>(n, graph_gpu, norm_factors); + norm_factor_computing_edge<<>>( + n, graph_gpu, norm_factors); #else CUDA_CHECK(cudaMalloc((void**)&norm_factors, n * sizeof(float_t))); - norm_factor_computing_node<<>>(n, graph_gpu, norm_factors); + norm_factor_computing_node<<>>( + n, graph_gpu, norm_factors); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; @@ -120,10 +132,13 @@ void Context::SetDevice(const int device_id) { if (current_device == device_id) return; CUDA_CHECK(cudaSetDevice(device_id)); if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (curand_generator_) +CURAND_CHECK(curandDestroyGenerator(curand_generator_)); CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, +CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, +cluster_seedgen())); } */ size_t Context::read_graph(bool selfloop) { @@ -151,21 +166,25 @@ size_t Context::read_graph(bool selfloop) { void Context::copy_data_to_device() { if (is_single_class) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), + cudaMemcpyHostToDevice)); } else { - CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), + cudaMemcpyHostToDevice)); } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); - //print_device_vector(10, d_feats, "d_feats"); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * 
sizeof(float_t), + cudaMemcpyHostToDevice)); + // print_device_vector(10, d_feats, "d_feats"); } -//void Context::copy_data_to_device() { - //float_malloc_device(n, d_labels); - //float_copy_device(n, h_labels, d_labels); - //float_malloc_device(n*feat_len, d_feats); - //float_copy_device(n*feat_len, &h_feats[0], d_feats); +// void Context::copy_data_to_device() { +// float_malloc_device(n, d_labels); +// float_copy_device(n, h_labels, d_labels); +// float_malloc_device(n*feat_len, d_feats); +// float_copy_device(n*feat_len, &h_feats[0], d_feats); //} -} // namespace context +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 8b9e726e8e..9c3454d550 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -2,52 +2,59 @@ #include "deepgalois/math_functions.hh" #include "galois/Galois.h" -void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, float_t* norm_factor) { - //std::cout << "[update_all] graph size: " << n << "\n"; - #ifndef GALOIS_USE_DIST +void deepgalois::update_all(size_t len, Graph& g, const float_t* in, + float_t* out, bool norm, float_t* norm_factor) { +// std::cout << "[update_all] graph size: " << n << "\n"; +#ifndef GALOIS_USE_DIST size_t n = g.size(); - galois::do_all(galois::iterate(size_t(0), n),[&](const auto src) { - #else + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { +#else auto& rangeObj = g.allNodesRange(); - galois::do_all(galois::iterate(rangeObj), [&](const auto src) { - #endif - auto src_idx = src * len; - // zero out the output data - math::clear_cpu(len , &out[src_idx]); - float_t a = 0.0; - float_t b = 0.0; - // get normalization factor if needed - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { - const auto dst = g.getEdgeDst(e); - assert(dst < n); - auto dst_idx = dst * len; - if (norm) { - // normalize b as well - b = a * norm_factor[dst]; - //float_t* neighbor = new float_t[len]; // this is super slow - vec_t neighbor(len); - // scale the neighbor's data using the normalization factor - math::scale(len, b, &in[dst_idx], &neighbor[0]); - // use scaled data to update; out[src] += in[dst] - math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); - } else { - // add embeddings from neighbors together; out[src] += in[dst] - math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); - } - } - }, galois::steal(), galois::loopname("update_all")); + galois::do_all( + galois::iterate(rangeObj), + [&](const auto src) { +#endif + auto src_idx = src * len; + // zero out the output data + math::clear_cpu(len, &out[src_idx]); + float_t a = 0.0; + float_t b = 0.0; + // get normalization factor if needed + if (norm) + a = norm_factor[src]; + // gather neighbors' embeddings + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + const auto dst = g.getEdgeDst(e); + assert(dst < n); + auto dst_idx = dst * len; + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + // float_t* neighbor = new float_t[len]; // this is super slow + vec_t neighbor(len); + // scale the neighbor's data using the normalization factor + math::scale(len, b, &in[dst_idx], &neighbor[0]); + // use scaled data to update; out[src] += in[dst] + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } else { + // add embeddings from neighbors together; out[src] += in[dst] + 
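// Explanatory note: with norm enabled (branch above) each neighbor
// contribution is scaled by 1/(sqrt(deg(src))*sqrt(deg(dst))), i.e. the
// symmetric GCN normalization D^{-1/2} A D^{-1/2} * in; in this branch the
// line below accumulates the plain sum out[src] += in[dst] over neighbors.
// update_all_csrmm() further down computes the same aggregation with a single
// sparse-matrix dense-matrix product (csrmm) using per-edge factors.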
math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); + } + } + }, + galois::steal(), galois::loopname("update_all")); } -void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, - bool, float_t* norm_factor) { +void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, + float_t* out, bool, float_t* norm_factor) { galois::StatTimer Tcsrmm("CSRMM-MKL"); Tcsrmm.start(); unsigned n = g.size(); - math::clear_cpu(n*len, out); - math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, out); + math::clear_cpu(n * len, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, + out); Tcsrmm.stop(); } - diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 158b1c2b4d..2bfe55ca46 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -15,16 +15,18 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, } __global__ void update_all_naive(size_t n, size_t len, GraphGPU g, - const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { float_t a = 0.0, b = 1.0; - if (norm) a = norm_factor[src]; + if (norm) + a = norm_factor[src]; index_type begin = g.edge_begin(src); index_type end = g.edge_end(src); for (index_type e = begin; e != end; e++) { index_type dst = g.getEdgeDst(e); - if (norm) b = a * norm_factor[dst]; + if (norm) + b = a * norm_factor[dst]; scale_add(len, b, in + dst * len, out + src * len, out + src * len); // out[src] += in[dst] } @@ -32,31 +34,36 @@ __global__ void update_all_naive(size_t n, size_t len, GraphGPU g, } __global__ void update_all_warp(size_t n, size_t len, GraphGPU g, - const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { - __shared__ index_type ptrs[BLOCK_SIZE/WARP_SIZE][2]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + __shared__ index_type ptrs[BLOCK_SIZE / WARP_SIZE][2]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - for(int src = warp_id; src < n; src += num_warps) { + for (int src = warp_id; src < n; src += num_warps) { float_t a = 0.0, b = 1.0; - if (norm) a = norm_factor[src]; + if (norm) + a = norm_factor[src]; if (thread_lane < 2) ptrs[warp_lane][thread_lane] = g.edge_begin(src + thread_lane); __syncthreads(); const index_type row_begin = ptrs[warp_lane][0]; const index_type row_end = ptrs[warp_lane][1]; - index_type base_src = src * len; - for(index_type offset = row_begin; offset < row_end; offset ++) { + index_type 
base_src = src * len; + for (index_type offset = row_begin; offset < row_end; offset++) { index_type dst = g.getEdgeDst(offset); - if (norm) b = a * norm_factor[dst]; + if (norm) + b = a * norm_factor[dst]; index_type base_dst = dst * len; for (int i = 0; i < len; i += WARP_SIZE) - if (thread_lane+i < len) - out[base_src+thread_lane+i] += in[base_dst+thread_lane+i] * b; + if (thread_lane + i < len) + out[base_src + thread_lane + i] += in[base_dst + thread_lane + i] * b; } } } @@ -65,29 +72,32 @@ void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); - update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); + // update_all_naive<<>>(n, len, g, in, + // out, norm, norm_factor); + update_all_warp<<<(n - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { - //g.print_test(); + bool norm, const float_t* norm_factor) { + // g.print_test(); unsigned n = g.size(); - auto nnz = g.sizeEdges(); + auto nnz = g.sizeEdges(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - //std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " nnz " << nnz << "\n"; - //print_device_vector(10, norm_factor, "norm_factor"); - float *temp; - const int *row_start = (const int*)g.row_start_ptr(); - const int *edge_dst = (const int*)g.edge_dst_ptr(); - //printf("row_start_ptr: 0x%x\n", row_start); - //printf("edge_dst_ptr: 0x%x\n", edge_dst); - //print_device_int_vector(10, row_start, "row_start"); - //print_device_int_vector(10, edge_dst, "edge_dst"); - float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); + // std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " + // nnz " << nnz << "\n"; print_device_vector(10, norm_factor, "norm_factor"); + float* temp; + const int* row_start = (const int*)g.row_start_ptr(); + const int* edge_dst = (const int*)g.edge_dst_ptr(); + // printf("row_start_ptr: 0x%x\n", row_start); + // printf("edge_dst_ptr: 0x%x\n", edge_dst); + // print_device_int_vector(10, row_start, "row_start"); + // print_device_int_vector(10, edge_dst, "edge_dst"); + float_malloc_device(n * len, temp); // TODO: avoid repetitive allocation + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, + temp, out); float_free_device(temp); } -} +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index e46a2477a6..d50f7bfb0a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -11,7 +11,8 @@ float_t* _dataToSync = nullptr; long unsigned _syncVectorSize = 0; #ifdef CPU_ONLY -inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { +inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); std::default_random_engine rng(seed); std::uniform_real_distribution dist(-init_range, init_range); @@ -22,7 +23,8 @@ inline void 
graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t } } -inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { +inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix) { matrix.resize(dim_x * dim_y); for (size_t i = 0; i < dim_x; ++i) { for (size_t j = 0; j < dim_y; ++j) @@ -31,7 +33,8 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t } // aggregate based on graph topology -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { // normalization constant based on graph structure #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); @@ -41,7 +44,8 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_ } // since graph is symmetric, the derivative is the same -void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #else @@ -49,9 +53,10 @@ void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, floa #endif } -void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { - float_t *a = new float_t[len]; - float_t *b = new float_t[len]; +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, + const float_t* neighbors, float_t* out) { + float_t* a = new float_t[len]; + float_t* b = new float_t[len]; math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors @@ -63,11 +68,12 @@ void graph_conv_layer::malloc_and_init() { size_t z = output_dims[1]; #ifdef GALOIS_USE_DIST // setup gluon - layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); + layer::gradientGraph = + new deepgalois::GluonGradients(layer::weight_grad, y * z); layer::syncSub = - new galois::graphs::GluonSubstrate( - *layer::gradientGraph, layer::gradientGraph->myHostID(), - layer::gradientGraph->numHosts(), false); + new galois::graphs::GluonSubstrate( + *layer::gradientGraph, layer::gradientGraph->myHostID(), + layer::gradientGraph->numHosts(), false); #endif #ifdef GALOIS_USE_DIST @@ -80,43 +86,52 @@ void graph_conv_layer::malloc_and_init() { // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); - if (dropout_) dropout_mask = new mask_t[x * y]; - in_temp = new float_t[x * y]; - out_temp = new float_t[x * z]; + if (dropout_) + dropout_mask = new mask_t[x * y]; + in_temp = new float_t[x * y]; + out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x - if (y <= z) in_temp1 = new float_t[x * y]; + if (y <= z) + in_temp1 = new float_t[x * y]; } // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - //std::cout << "layer: " << name_ << "\n"; - //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; + // std::cout << "layer: " << name_ << "\n"; + // std::cout << "x=" << x << ", y=" << y << ", z=" 
<< z << "\n"; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W if (dropout_ && phase_ == net_phase::train) - math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - else math::copy_cpu(x*y, in_data, in_temp); + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); + else + math::copy_cpu(x * y, in_data, in_temp); if (y > z) { - math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); aggregate(z, *graph_cpu, out_temp, out_data); } else { aggregate(y, *graph_cpu, in_temp, in_temp1); - math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, + &layer::W[0], 0.0, out_data); } #ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync("AggSync"); + deepgalois::_dataToSync = out_data; + layer::context->getSyncSubstrate()->sync( + "AggSync"); #endif // run relu activation on output if specified - if (act_) math::relu_cpu(x*z, out_data, out_data); + if (act_) + math::relu_cpu(x * z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -127,8 +142,9 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? - if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); - //else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + // else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying if (y > z) { d_aggregate(z, *graph_cpu, out_grad, out_temp); @@ -137,22 +153,28 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // this calculates gradients for the node predictions if (level_ != 0) // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix - math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y - // calculate weight gradients using input data; multiplied by gradients from last back prop step - math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + // calculate weight gradients using input data; multiplied by gradients from + // last back prop step + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } else { if (level_ != 0) { - math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], 0.0, in_temp); + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], + 0.0, in_temp); d_aggregate(y, *graph_cpu, in_temp, in_grad); } - math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, + 0.0, &layer::weight_grad[0]); } #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; - 
deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync("AggSyncBack"); + deepgalois::_dataToSync = out_temp; + layer::context->getSyncSubstrate()->sync( + "AggSyncBack"); #endif if (level_ != 0 && dropout_) @@ -160,14 +182,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); - //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); #endif } acc_t graph_conv_layer::get_weight_decay_loss() { - return math::l2_norm(input_dims[1]*output_dims[1], &layer::W[0]); + return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); } #endif // end if CPU_ONLY -} // namespace - +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index a1682847ad..f8b59d3c0e 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -8,32 +8,35 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; - if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); - float_malloc_device(x*y, in_temp); - init_const_gpu(x*y, 0.0, in_temp); + if (dropout_) + CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); + float_malloc_device(x * y, in_temp); + init_const_gpu(x * y, 0.0, in_temp); if (y <= z) { - float_malloc_device(x*y, in_temp1); - init_const_gpu(x*y, 0.0, in_temp1); + float_malloc_device(x * y, in_temp1); + init_const_gpu(x * y, 0.0, in_temp1); } - float_malloc_device(x*z, out_temp); - init_const_gpu(x*z, 0.0, out_temp); - float_malloc_device(y*z, d_W); + float_malloc_device(x * z, out_temp); + init_const_gpu(x * z, 0.0, out_temp); + float_malloc_device(y * z, d_W); auto init_range = sqrt(6.0 / (y + z)); // Glorot & Bengio (AISTATS 2010) rng_uniform_gpu(y * z, -init_range, init_range, d_W); - float_malloc_device(y*z, layer::d_weight_grad); - init_const_gpu(y*z, 0.0, layer::d_weight_grad); + float_malloc_device(y * z, layer::d_weight_grad); + init_const_gpu(y * z, 0.0, layer::d_weight_grad); } -void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { - #ifdef USE_CUSPARSE +void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { +#ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); - #else +#else deepgalois::update_all(len, g, in, out, norm_, norm_consts); - #endif +#endif } -void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { +void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else @@ -41,62 +44,74 @@ void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, f #endif } -void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { -} +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out) {} // GPU forward: compute output features // NOTE: in_data will be used in back-prop, so it can not be modified -void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t x = 
input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - - // currently only support feature length <= 128 + + // currently only support feature length <= 128 if (z > MAX_NUM_CLASSES) { - std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; + std::cout << "Currently support maximum hidden feature length of " + << MAX_NUM_CLASSES << "\n"; exit(0); } - init_const_gpu(x*z, 0.0, out_temp); + init_const_gpu(x * z, 0.0, out_temp); if (dropout_ && phase_ == net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - else copy_gpu(x*y, in_data, in_temp); + else + copy_gpu(x * y, in_data, in_temp); if (y > z) { - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, + out_temp); graph_conv_layer::aggregate(z, *graph_gpu, out_temp, out_data); } else { graph_conv_layer::aggregate(y, *graph_gpu, in_temp, in_temp1); - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, out_data); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, + out_data); } - if (act_) relu_gpu(x * z, out_data, out_data); + if (act_) + relu_gpu(x * z, out_data, out_data); } -// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) -void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, +// GPU backward: compute input gradients (in_grad) and weight gradients +// (d_weight_grad) +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - - if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); + + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_grad); if (y > z) { graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); if (level_ != 0) - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + layer::d_weight_grad); } else { if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, in_temp); + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, + in_temp); graph_conv_layer::d_aggregate(y, *graph_gpu, in_temp, in_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, layer::d_weight_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, + layer::d_weight_grad); } if (level_ != 0 && dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } acc_t graph_conv_layer::get_weight_decay_loss() { - return l2_norm_gpu(input_dims[1]*output_dims[1], d_W); + return l2_norm_gpu(input_dims[1] * output_dims[1], d_W); } -} // namespace - +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 864eaeb321..8de2406ede 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -4,43 +4,50 @@ namespace deepgalois { -void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void l2_norm_layer::forward_propagation(const float_t* 
in_data, + float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; - galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { - //for (size_t i = 0; i < x; i++) { - float_t sum = 0.0; - size_t idx = i * y; - for (size_t j = 0; j < y; j++) { - sum += in_data[idx + j] * in_data[idx + j]; - } - sum = std::max(sum, epsilon_); - sum = sqrt(sum); - for (size_t j = 0; j < y; j++) { - out_data[idx + j] = in_data[idx + j] / sum * scale_; - } - }, galois::loopname("l2_norm")); + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum = 0.0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum += in_data[idx + j] * in_data[idx + j]; + } + sum = std::max(sum, epsilon_); + sum = sqrt(sum); + for (size_t j = 0; j < y; j++) { + out_data[idx + j] = in_data[idx + j] / sum * scale_; + } + }, + galois::loopname("l2_norm")); } void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, - float_t* out_grad, float_t* in_grad) { + float_t* out_grad, float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; - galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { - //for (size_t i = 0; i < x; i++) { - float_t sum_x2 = 0.0; - float_t coef0_axis0 = 0, coef1_axis0 = 0; - size_t idx = i * y; - for (size_t j = 0; j < y; j++) { - sum_x2 += powf(in_data[idx + j], 2); - coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; - } - coef1_axis0 = powf(sum_x2, -1.5); - for (size_t j = 0; j < y; j++) { - in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 - + out_grad[idx + j] * sum_x2 * coef1_axis0; - } - }, galois::loopname("d_l2_norm")); + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum_x2 = 0.0; + float_t coef0_axis0 = 0, coef1_axis0 = 0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum_x2 += powf(in_data[idx + j], 2); + coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; + } + coef1_axis0 = powf(sum_x2, -1.5); + for (size_t j = 0; j < y; j++) { + in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 + + out_grad[idx + j] * sum_x2 * coef1_axis0; + } + }, + galois::loopname("d_l2_norm")); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu index e600b6fbbb..ed86cf147d 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cu +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -3,17 +3,19 @@ namespace deepgalois { -void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void l2_norm_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; l2_norm_gpu(x, y, in_data, out_data); } -void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { +void l2_norm_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; d_l2_norm_gpu(x, y, in_data, out_grad, in_grad); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index f0ea5f591e..dd4357739f 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -3,27 +3,28 @@ namespace deepgalois { 
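// Plain-ASCII reading of the formulas in the comments below (eps is the
// epsilon_ leak slope passed to the constructor):
//   forward:  y = (x > 0) ? x : eps * x
//   backward: dL/dx = (y > 0) ? dL/dy : eps * dL/dy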
-leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, - dims_t in_dims, dims_t out_dims) +leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims) : layer(level, in_dims, out_dims), epsilon_(eps) { assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = false; - n = input_dims[0] * input_dims[1]; - name_ = layer_type() + "_" + std::to_string(level); + n = input_dims[0] * input_dims[1]; + name_ = layer_type() + "_" + std::to_string(level); } #ifdef CPU_ONLY -// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต -void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { math::leaky_relu_cpu(n, epsilon_, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { + float_t* out_grad, float_t* in_grad) { math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu index 6fe4d005ac..a6271086e9 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cu +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -3,16 +3,18 @@ namespace deepgalois { -// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต -void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? 
๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { leaky_relu_gpu(n, epsilon_, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) -void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { +void leaky_relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { d_leaky_relu_gpu(n, epsilon_, out_grad, in_data, in_grad); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 9e54d64975..03cd0f4652 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -5,7 +5,8 @@ namespace deepgalois { #ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t n = input_dims[0] * input_dims[1]; math::relu_cpu(n, in_data, out_data); } @@ -19,4 +20,4 @@ void relu_layer::back_propagation(const float_t*, const float_t* out_data, } #endif -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu index 0d39a9dab2..d457c994ce 100644 --- a/libdeepgalois/src/layers/relu_layer.cu +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -4,17 +4,19 @@ namespace deepgalois { // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { const size_t count = input_dims[0] * input_dims[1]; relu_gpu(count, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { +void relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { const size_t count = input_dims[0] * input_dims[1]; d_relu_gpu(count, out_grad, in_data, in_grad); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index d7ec46378e..d20f2a769b 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -13,53 +13,65 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -sigmoid_loss_layer::~sigmoid_loss_layer() { - delete[] loss; -} +sigmoid_loss_layer::~sigmoid_loss_layer() { delete[] loss; } void sigmoid_loss_layer::malloc_and_init() { loss = new float_t[input_dims[0]]; // error for each sample } inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { - //return context->get_label(i, j); - return labels[i*input_dims[1]+j]; + // return context->get_label(i, j); + return labels[i * input_dims[1] + j]; } -void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t len = input_dims[1]; - 
galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - size_t idx = len * i; - // output is normalized input for this layer - math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid - // one hot encoded vector for the labels - float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); - // loss calculation - loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); - delete[] ground_truth; - } - }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); + galois::do_all( + galois::iterate(begin_, end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + size_t idx = len * i; + // output is normalized input for this layer + math::sigmoid(len, &in_data[idx], + &out_data[idx]); // normalize using sigmoid + // one hot encoded vector for the labels + float_t* ground_truth = new float_t[len]; + for (size_t j = 0; j < len; j++) + ground_truth[j] = (float_t)get_label(i, j); + // loss calculation + loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); + delete[] ground_truth; + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-fw")); } -void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t*, float_t* in_grad) { +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { size_t len = layer::input_dims[1]; - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - size_t idx = len * i; - float_t *norm_grad = new float_t[len]; - float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); - // derviative sigmoid to gradient used in the next layer - math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], norm_grad); - delete[] norm_grad; - delete[] ground_truth; - } - }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + size_t idx = len * i; + float_t* norm_grad = new float_t[len]; + float_t* ground_truth = new float_t[len]; + for (size_t j = 0; j < len; j++) + ground_truth[j] = (float_t)get_label(i, j); + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); + // derviative sigmoid to gradient used in the next layer + math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], + norm_grad); + delete[] norm_grad; + delete[] ground_truth; + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-bw")); } acc_t sigmoid_loss_layer::get_prediction_loss() { @@ -68,15 +80,19 @@ acc_t sigmoid_loss_layer::get_prediction_loss() { galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + galois::do_all( + galois::iterate(layer::begin_, 
layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } #endif -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index f00689dfc9..0f5ff9cb69 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -13,9 +13,7 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -sigmoid_loss_layer::~sigmoid_loss_layer() { - float_free_device(loss); -} +sigmoid_loss_layer::~sigmoid_loss_layer() { float_free_device(loss); } void sigmoid_loss_layer::malloc_and_init() { float_malloc_device(input_dims[0], loss); @@ -24,19 +22,19 @@ void sigmoid_loss_layer::malloc_and_init() { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, labels, loss, out_data); + sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - labels, out_data, in_grad); + d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); } acc_t sigmoid_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index d40ff6d411..f3eb3ee969 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -13,9 +13,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -softmax_loss_layer::~softmax_loss_layer() { - delete[] loss; -} +softmax_loss_layer::~softmax_loss_layer() { delete[] loss; } void softmax_loss_layer::malloc_and_init() { loss = new float_t[input_dims[0]]; // error for each sample @@ -23,44 +21,58 @@ void softmax_loss_layer::malloc_and_init() { inline label_t softmax_loss_layer::get_label(size_t i) { return labels[i]; - //return context->get_label(i); + // return context->get_label(i); } // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - // output is normalized input for this layer - math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // one hot encoded vector for the labels - vec_t groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[get_label(i)] = 1.0; // one-hot - // loss calculation - loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); - } - }, galois::chunk_size<64>(), galois::steal(), 
galois::loopname("softmax-loss-fw")); + galois::do_all( + galois::iterate(begin_, end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + // output is normalized input for this layer + math::softmax(len, &in_data[len * i], + &out_data[len * i]); // normalize using softmax + // one hot encoded vector for the labels + vec_t groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[get_label(i)] = 1.0; // one-hot + // loss calculation + loss[i] = + math::cross_entropy(len, &groundTruth[0], &out_data[len * i]); + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-fw")); // no sync required in distributed execution since no graph topology used // in this forward pass; only a post-process pretty much } -void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t*, float_t* in_grad) { +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[get_label(i)] = 1.0; - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); - // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); - } - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-bw")); + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector groundTruth(len, 0.0); + groundTruth[get_label(i)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], + &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-bw")); // no weight sync required: this is all local graph information } @@ -71,16 +83,21 @@ acc_t softmax_loss_layer::get_prediction_loss() { galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("getMaskedLoss")); - //std::cout << "begin = " << begin_ << " end = " << end_ << " count = " << count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("getMaskedLoss")); + // std::cout << "begin = " << begin_ << " end = " << end_ << " count = " << + // count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } #endif -} // namespace +} // namespace 
deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 59a955526b..20b7e659d8 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -13,9 +13,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -softmax_loss_layer::~softmax_loss_layer() { - float_free_device(loss); -} +softmax_loss_layer::~softmax_loss_layer() { float_free_device(loss); } void softmax_loss_layer::malloc_and_init() { float_malloc_device(input_dims[0], loss); @@ -24,19 +22,19 @@ void softmax_loss_layer::malloc_and_init() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, labels, loss, out_data); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - labels, out_data, in_grad); + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); } acc_t softmax_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 26811280a1..572f4e5662 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -10,7 +10,7 @@ bool LearningGraph::isLocal(index_t) { return true; } index_t LearningGraph::getLID(index_t) { return 0; } -bool LearningGraph::is_vertex_cut() {return true; } +bool LearningGraph::is_vertex_cut() { return true; } std::vector>& LearningGraph::getMirrorNodes() { return mirrorNodes; @@ -26,13 +26,14 @@ void LearningGraph::readGraph(std::string dataset) { } void LearningGraph::degree_counting() { - //if (degrees_ != NULL) return; - //degrees_ = new index_t[num_vertices_]; - galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), [&] (auto v) { - degrees_[v] = rowptr_[v+1] - rowptr_[v]; - }, galois::loopname("DegreeCounting")); + // if (degrees_ != NULL) return; + // degrees_ = new index_t[num_vertices_]; + galois::do_all( + galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); } void LearningGraph::dealloc() {} -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 2c630ca7ae..679a4b6d8a 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -15,17 +15,23 @@ void LearningGraph::dealloc() { CUDA_CHECK(cudaFree(d_colidx_)); CUDA_CHECK(cudaFree(d_rowptr_)); CUDA_CHECK(cudaFree(d_degrees_)); - if (edge_data_ != NULL) CUDA_CHECK(cudaFree(d_edge_data_)); - if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(d_vertex_data_)); + if (edge_data_ != NULL) + CUDA_CHECK(cudaFree(d_edge_data_)); + if (vertex_data_ != NULL) + CUDA_CHECK(cudaFree(d_vertex_data_)); } void LearningGraph::allocOnDevice(bool no_edge_data__) { - if (d_colidx_ != NULL) return; - CUDA_CHECK(cudaMalloc((void **) &d_colidx_, num_edges_ * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) &d_rowptr_, (num_vertices_+1) * sizeof(index_t))); - 
//CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); - //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, num_edges_ * sizeof(edge_data___t))); - //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); + if (d_colidx_ != NULL) + return; + CUDA_CHECK(cudaMalloc((void**)&d_colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK( + cudaMalloc((void**)&d_rowptr_, (num_vertices_ + 1) * sizeof(index_t))); + // CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * + // sizeof(index_t))); if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) + // &edge_data__, num_edges_ * sizeof(edge_data___t))); + // CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * + // sizeof(vdata_t))); is_device = true; } @@ -38,22 +44,34 @@ void LearningGraph::print_test() { void LearningGraph::copy_to_gpu() { allocOnDevice(edge_data_ == NULL); - CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), + num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), + (num_vertices_ + 1) * sizeof(index_t), + cudaMemcpyHostToDevice)); print_test(); - //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); - //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyHostToDevice)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyHostToDevice)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ + // * sizeof(vdata_t), cudaMemcpyHostToDevice)); } void LearningGraph::copy_to_cpu() { - CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); - //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, + num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, + (num_vertices_ + 1) * sizeof(index_t), + cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyDeviceToHost)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, + // num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } void LearningGraph::degree_counting() {} -} +} // namespace 
deepgalois diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 0923411ff2..3b96341c66 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -10,17 +10,17 @@ #ifdef USE_MKL #include -#else // If use MKL, simply include the MKL header +#else // If use MKL, simply include the MKL header extern "C" { #include } #endif -#define NOT_IMPLEMENTED \ - do { \ - std::cout << "Not Implemented Yet";\ - exit(1); \ - } while(0); +#define NOT_IMPLEMENTED \ + do { \ + std::cout << "Not Implemented Yet"; \ + exit(1); \ + } while (0); /* #include @@ -39,7 +39,7 @@ void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { */ std::default_random_engine generator; -std::uniform_real_distribution distribution(0.0,1.0); +std::uniform_real_distribution distribution(0.0, 1.0); namespace deepgalois { @@ -57,43 +57,48 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.start(); int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); Tmatmul.stop(); } #ifdef USE_MKL void csrmm_cpu(const int M, const int N, const int K, const int, - const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, - const float* B, const float beta, float* C) { + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nnz_idx, const float* B, const float beta, float* C) { #else -void csrmm_cpu(const int, const int, const int, const int, - const float, float*, int*, int*, - const float*, const float, float*) { +void csrmm_cpu(const int, const int, const int, const int, const float, float*, + int*, int*, const float*, const float, float*) { #endif #ifdef USE_MKL - //mkl_set_num_threads(56); - //const char *matdescra = "GXXCX";//6 bytes - //const char transa = 'N'; - //mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); + // mkl_set_num_threads(56); + // const char *matdescra = "GXXCX";//6 bytes + // const char transa = 'N'; + // mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, + // A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); sparse_status_t status; - bool need_trans = false; - bool is_row_major = true; - sparse_matrix_t csrA = NULL; + bool need_trans = false; + bool is_row_major = true; + sparse_matrix_t csrA = NULL; sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; - sparse_layout_t layout = (is_row_major ? SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); - status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, A_idx_ptr + 1, A_nnz_idx, A_nonzeros); + sparse_layout_t layout = + (is_row_major ? SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); + status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, + A_idx_ptr + 1, A_nnz_idx, A_nonzeros); if (status != SPARSE_STATUS_SUCCESS) { std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; exit(1); } - sparse_operation_t transa = (need_trans ? SPARSE_OPERATION_TRANSPOSE : SPARSE_OPERATION_NON_TRANSPOSE); + sparse_operation_t transa = (need_trans ? 
SPARSE_OPERATION_TRANSPOSE + : SPARSE_OPERATION_NON_TRANSPOSE); struct matrix_descr descrA; descrA.type = SPARSE_MATRIX_TYPE_GENERAL; - //descrA.mode = SPARSE_FILL_MODE_UPPER; - //descrA.diag = SPARSE_DIAG_NON_UNIT; - //mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); - //mkl_sparse_optimize(csrA); - status = mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); + // descrA.mode = SPARSE_FILL_MODE_UPPER; + // descrA.diag = SPARSE_DIAG_NON_UNIT; + // mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); + // mkl_sparse_optimize(csrA); + status = + mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); if (status != SPARSE_STATUS_SUCCESS) { std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; exit(1); @@ -105,8 +110,9 @@ void csrmm_cpu(const int, const int, const int, const int, } // matrix-vector multiply -void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, - const float* A, const float* x, const float beta, float* y) { +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y) { cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } @@ -123,7 +129,7 @@ inline void rng_uniform_cpu(size_t n, float_t* r) { for (size_t i = 0; i < n; ++i) { r[i] = distribution(generator); } - //galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + // galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { // unsigned short xi[3]; // r[i] = erand48(xi); //}, galois::loopname("randomMaskGen")); @@ -137,18 +143,15 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { #ifdef __AVX2__ const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -#else - for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; -#endif + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), +_mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + +b[i]; #else for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; #endif } #if defined(__AVX__) || defined(__AVX2__) -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { - const size_t alignedN = n - n % vec_len; - const __m256 scal = _mm256_set1_ps(alpha); - for (size_t i = 0; i < alignedN; i += vec_len) +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) +{ const size_t alignedN = n - n % vec_len; const __m256 scal = +_mm256_set1_ps(alpha); for (size_t i = 0; i < alignedN; i += vec_len) _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; } @@ -176,8 +179,8 @@ float_t l2_norm(size_t n, const float_t* in) { } #else // vector multiply scalar -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { - for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) +{ for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } float_t l2_norm(size_t n, const float_t* a) { @@ -195,10 +198,13 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { #ifdef __AVX2__ const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) - 
_mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) y[i] = a[i] + b[i]; + _mm256_storeu_ps( + &y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + y[i] = a[i] + b[i]; #else - for (size_t i = 0; i < n; ++i) y[i] = a[i] + b[i]; + for (size_t i = 0; i < n; ++i) + y[i] = a[i] + b[i]; #endif #endif } @@ -212,7 +218,7 @@ void scale(size_t n, const float_t alpha, const float_t* x, float_t* y) { cblas_sscal(n, alpha, y, 1); } -void axpy(size_t n, const float_t a, float_t *x, float_t *y) { +void axpy(size_t n, const float_t a, float_t* x, float_t* y) { cblas_saxpy(n, a, x, 1, y, 1); } @@ -229,9 +235,7 @@ int argmax(const size_t n, const float_t* x) { } // l2 normalization -float_t l2_norm(size_t n, const float_t* x) { - return cblas_snrm2(n, x, 1); -} +float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } // dot product float_t dot(size_t n, const float_t* x, const float_t* y) { @@ -239,13 +243,13 @@ float_t dot(size_t n, const float_t* x, const float_t* y) { } void clear_cpu(size_t n, float_t* in) { - //for (size_t i = 0; i < n; i++) in[i] = 0; - std::fill(in, in+n, 0); + // for (size_t i = 0; i < n; i++) in[i] = 0; + std::fill(in, in + n, 0); // memset(in, 0, n*sizeof(float_t)); } -void dropout(size_t m, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out) { +void dropout(size_t m, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { for (size_t i = 0; i < m; ++i) masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < m; ++i) @@ -253,75 +257,88 @@ void dropout(size_t m, float scale, float dropout_rate, } void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out) { + const float_t* in, mask_t* masks, float_t* out) { size_t len = n * m; -/* -#ifdef USE_MKL - vec_t rands(len); - rng_uniform_cpu(len, &rands[0]); - galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { - masks[i] = rands[i] > dropout_rate ? 1 : 0; - }, galois::loopname("randomMaskGen")); -*/ -/* - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - auto idx = i * m; - vec_t rands(m); - rng_uniform_cpu(m, &rands[0]); - for (size_t j = 0; j < m; ++j) - masks[idx+j] = rands[j] > dropout_rate ? 1 : 0; - }, galois::loopname("dropout")); -#else -*/ + /* + #ifdef USE_MKL + vec_t rands(len); + rng_uniform_cpu(len, &rands[0]); + galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { + masks[i] = rands[i] > dropout_rate ? 1 : 0; + }, galois::loopname("randomMaskGen")); + */ + /* + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + auto idx = i * m; + vec_t rands(m); + rng_uniform_cpu(m, &rands[0]); + for (size_t j = 0; j < m; ++j) + masks[idx+j] = rands[j] > dropout_rate ? 
1 : 0; + }, galois::loopname("dropout")); + #else + */ for (size_t i = 0; i < len; ++i) { masks[i] = bernoulli(dropout_rate); } -//#endif - galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { - out[i] = in[i] * (float_t)masks[i] * scale; - }, galois::loopname("dropout")); + //#endif + galois::do_all( + galois::iterate((size_t)0, len), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("dropout")); } -void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, float_t* out) { +void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, + float_t* out) { for (size_t i = 0; i < m; ++i) out[i] = in[i] * (float_t)masks[i] * scale; } void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* masks, float_t* out) { - galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { - out[i] = in[i] * (float_t)masks[i] * scale; - }, galois::loopname("d_dropout")); + galois::do_all( + galois::iterate((size_t)0, n * m), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("d_dropout")); } void relu_cpu(size_t n, const float_t* in, float_t* out) { // TODO: vectorize - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = std::max(in[i], float_t(0)); - }, galois::chunk_size<64>(), galois::loopname("relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); }, + galois::chunk_size<64>(), galois::loopname("relu")); } -void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) { +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, + float_t* out) { // TODO: vectorize // check if original data greater than 0; if so keep grad - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = data[i] > float_t(0) ? in[i] : float_t(0); - }, galois::chunk_size<64>(), galois::loopname("d_relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = data[i] > float_t(0) ? in[i] : float_t(0); + }, + galois::chunk_size<64>(), galois::loopname("d_relu")); } -void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) { +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + float_t* out) { // TODO: vectorize - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; - }, galois::chunk_size<64>(), galois::loopname("leaky_relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; }, + galois::chunk_size<64>(), galois::loopname("leaky_relu")); } void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out) { // TODO: vectorize - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = in[i] * (data[i] > float_t(0) ? float_t(1) : epsilon); - }, galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = in[i] * (data[i] > float_t(0) ? float_t(1) : epsilon); + }, + galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); } void softmax(size_t n, const float_t* input, float_t* output) { @@ -368,9 +385,10 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { } } -// use sigmoid instead of softmax for multi-class datasets, e.g. 
ppi, yelp and amazon -// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } -inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } +// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and +// amazon inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + +// 0.5; } +inline float_t sigmoid_func(float_t x) { return 1. / (1. + expf(-x)); } // Sigmoid void sigmoid(size_t n, const float_t* in, float_t* out) { @@ -379,15 +397,16 @@ void sigmoid(size_t n, const float_t* in, float_t* out) { } } -void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, const float_t* dp) { +void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, + const float_t* dp) { for (size_t i = 0; i < n; i++) { dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); } } void copy_cpu(size_t n, const float_t* in, float_t* out) { - //std::copy(in, in + n, out); - //memcpy(out, in, sizeof(float_t) * n); + // std::copy(in, in + n, out); + // memcpy(out, in, sizeof(float_t) * n); cblas_scopy(n, in, 1, out, 1); } @@ -416,4 +435,3 @@ float reduce_mean(size_t n, const float_t* x) { } // end namespace math } // end namespace deepgalois - diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 449b597621..06d854d4b7 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -5,22 +5,25 @@ #include "cub/cub.cuh" #include -__global__ void init_const_kernel(int n, float_t value, float_t *array) { +__global__ void init_const_kernel(int n, float_t value, float_t* array) { CUDA_KERNEL_LOOP(i, n) { array[i] = value; } } -void init_const_gpu(int n, float_t value, float_t *array) { +void init_const_gpu(int n, float_t value, float_t* array) { init_const_kernel<<>>(n, value, array); CudaTest("solving init_const kernel failed"); } -__global__ void isnan_test(const int n, const float *data, bool *result) { - CUDA_KERNEL_LOOP(i, n) { if (isnan(data[i])) *result = true; } +__global__ void isnan_test(const int n, const float* data, bool* result) { + CUDA_KERNEL_LOOP(i, n) { + if (isnan(data[i])) + *result = true; + } } -bool isnan_gpu(int n, const float_t *array) { - bool *d_result, h_result = false; - cudaMalloc((void **)&d_result, sizeof (bool)); +bool isnan_gpu(int n, const float_t* array) { + bool *d_result, h_result = false; + cudaMalloc((void**)&d_result, sizeof(bool)); cudaMemcpy(d_result, &h_result, sizeof(bool), cudaMemcpyHostToDevice); isnan_test<<>>(n, array, d_result); CudaTest("solving init_const kernel failed"); @@ -29,11 +32,13 @@ bool isnan_gpu(int n, const float_t *array) { } void gpu_rng_uniform(size_t n, float_t* r) { - CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + CURAND_CHECK( + curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); } void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + CURAND_CHECK( + curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) scal_gpu(n, range, r); @@ -41,15 +46,19 @@ void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { add_scalar_gpu(n, a, r); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { - CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, n, mu, sigma)); +void gpu_rng_gaussian(const int n, const float_t 
mu, const float_t sigma, + float_t* r) { + CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, + n, mu, sigma)); } bool is_allocated_device(float_t* data) { - if (data == NULL) return false; + if (data == NULL) + return false; cudaPointerAttributes attributes; CUDA_CHECK(cudaPointerGetAttributes(&attributes, data)); - if (attributes.devicePointer != NULL) return true; + if (attributes.devicePointer != NULL) + return true; return false; } @@ -57,18 +66,18 @@ void float_malloc_device(int n, float_t*& ptr) { CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(float_t))); } -void float_free_device(float_t*& ptr) { - CUDA_CHECK(cudaFree(ptr)); -} +void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } -void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr) { - CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr) { + CUDA_CHECK( + cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); } void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } __global__ void setup_curand_kernel(const int n, curandState* state) { @@ -79,17 +88,17 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { } __global__ void dropout_kernel(int n, float scale, float threshold, - float_t *rands, const float_t* in, - mask_t* masks, float_t* out) { + float_t* rands, const float_t* in, mask_t* masks, + float_t* out) { CUDA_KERNEL_LOOP(i, n) { - masks[i] = rands[i] > threshold ? 1 : 0; - out[i] = in[i] * masks[i] * scale; + masks[i] = rands[i] > threshold ? 1 : 0; + out[i] = in[i] * masks[i] * scale; } } -void dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out) { - float_t *rands; +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { + float_t* rands; float_malloc_device(n, rands); gpu_rng_uniform(n, rands); dropout_kernel<<>>( @@ -99,12 +108,13 @@ void dropout_gpu(int n, float scale, float dropout_rate, } __global__ void d_dropout_kernel(int n, float scale, float threshold, - const float_t* in, const mask_t* masks, float_t* out) { + const float_t* in, const mask_t* masks, + float_t* out) { CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } } -void d_dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, const mask_t* masks, float_t* out) { +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out) { d_dropout_kernel<<>>( n, scale, dropout_rate, in, masks, out); CudaTest("solving d_dropout kernel failed"); @@ -138,21 +148,24 @@ __global__ void leaky_relu_kernel(const int n, const float_t epsilon, CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? 
in[i] : epsilon * in[i]; } } -void leaky_relu_gpu(const int n, const float_t epsilon, - const float_t* in, float_t* out) { - leaky_relu_kernel<<>>(n, epsilon, in, out); +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out) { + leaky_relu_kernel<<>>(n, epsilon, in, + out); CudaTest("solving leaky_relu kernel failed"); } -__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, - const float_t* in_diff, const float_t* data, float_t* out_diff) { +__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { CUDA_KERNEL_LOOP(i, n) { out_diff[i] = in_diff[i] * (data[i] > 0 ? 1.0 : epsilon); } } -void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, - const float_t* data, float_t* out_diff) { +void d_leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { d_leaky_relu_kernel<<>>( n, epsilon, in_diff, data, out_diff); CudaTest("solving d_leaky_relu kernel failed"); @@ -160,23 +173,23 @@ void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff __global__ void matmul_kernel(int x, int y, int z, const float_t* A, const float_t* B, float_t* C) { - int row = blockIdx.x*blockDim.x+threadIdx.x; - int col = blockIdx.y*blockDim.y+threadIdx.y; - float_t sum = 0.0f; - if (row < x && col < y) { - for (int i = 0; i < z; i++) { - sum += A[row * z + i] * B[i * y + col]; - } - } - C[row * y + col] = sum; + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + float_t sum = 0.0f; + if (row < x && col < y) { + for (int i = 0; i < z; i++) { + sum += A[row * z + i] * B[i * y + col]; + } + } + C[row * y + col] = sum; } #define TILE_SZ 16 void matmul_gpu(const size_t x, const size_t y, const size_t z, - const float_t* A, const float_t* B, float_t* C) { + const float_t* A, const float_t* B, float_t* C) { dim3 threadsPerBlock(TILE_SZ, TILE_SZ); - dim3 blocksPerGrid((y-1)/TILE_SZ+1, (x-1)/TILE_SZ+1); - matmul_kernel<<>>(x, y, z, A, B, C); + dim3 blocksPerGrid((y - 1) / TILE_SZ + 1, (x - 1) / TILE_SZ + 1); + matmul_kernel<<>>(x, y, z, A, B, C); CudaTest("solving matmul kernel failed"); } @@ -190,8 +203,9 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, + cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, + N)); } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, @@ -201,57 +215,60 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -// C = A x B, where A is a sparse matrix in CSR format, B is the dense matrix for vertex -// feature tensor. However, since cusparse only supports column-major, while feature -// tensor is stored in row-major, the actual computation is: C = trans(A x trans(B)). -// Currently, we use cublasSgeam to implement transposition and allocate intermediate -// workspace memory (transpose_C) for this. 
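// A minimal sketch of the layout identity the comment above relies on
// (shapes and indices here are illustrative only): a row-major M x N buffer
// holds exactly the same bytes as a column-major N x M buffer containing the
// transpose, because with leading dimension N
//   buf[i * N + j]   // row-major element (i, j)
// is also the address of column-major element (j, i). So the row-major
// feature matrix B (K x N) is handed to cuSPARSE unchanged and read as B^T
// (N x K, column-major); requesting op(B) = transpose in the SpMM call then
// multiplies by B itself, yielding A * B in column-major order, and the single
// dense transpose done by cublasSgeam afterwards converts that result into the
// row-major C the rest of the code expects.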
-void csrmm_gpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, - const float* B, const float beta, float *transpose_C, float* C) { - //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; - CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, - A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); - //transpose C - const float one = 1.0; - const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, - N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +// C = A x B, where A is a sparse matrix in CSR format, B is the dense matrix +// for vertex feature tensor. However, since cusparse only supports +// column-major, while feature tensor is stored in row-major, the actual +// computation is: C = trans(A x trans(B)). Currently, we use cublasSgeam to +// implement transposition and allocate intermediate workspace memory +// (transpose_C) for this. +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nnz_idx, const float* B, const float beta, + float* transpose_C, float* C) { + // std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << + // ", nnz=" << nnz << "\n"; + CUSPARSE_CHECK(cusparseScsrmm2( + deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, + deepgalois::Context::cusparse_matdescr(), A_nonzeros, A_idx_ptr, + A_nnz_idx, B, N, &beta, transpose_C, M)); + // transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, + CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, + M, C, N)); } /* -void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, +void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { std::cout << "[debug]: csrmm_gpu\n"; cusparseSpMatDescr_t A_descr; - CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, A_nonzeros, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, +A_nonzeros, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); cusparseDnMatDescr_t B_descr; - CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, CUSPARSE_ORDER_COL)); - cusparseDnMatDescr_t C_descr; - CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, CUSPARSE_ORDER_COL)); - size_t bufferSize; + CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, +CUSPARSE_ORDER_COL)); cusparseDnMatDescr_t C_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, +CUSPARSE_ORDER_COL)); size_t bufferSize; CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::Context::cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - (void*)&alpha, A_descr, B_descr, (void*)&beta, C_descr, - CUDA_R_32F, 
CUSPARSE_COOMM_ALG1, &bufferSize)); + CUSPARSE_OPERATION_NON_TRANSPOSE, +CUSPARSE_OPERATION_TRANSPOSE, (void*)&alpha, A_descr, B_descr, (void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); cudaDeviceSynchronize(); void* buffer = NULL; if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); CUSPARSE_CHECK(cusparseSpMM(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - (const void*)&alpha, A_descr, B_descr, (const void*)&beta, C_descr, - CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); - cudaDeviceSynchronize(); + (const void*)&alpha, A_descr, B_descr, (const void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); cudaDeviceSynchronize(); //transpose C const float one = 1.0; - const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, - N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, +CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } //*/ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, @@ -259,16 +276,18 @@ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, &alpha, A, - N, x, 1, &beta, y, 1)); + CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, + &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { - CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK( + cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK( + cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { @@ -276,8 +295,10 @@ void asum_gpu(const int n, const float* x, float* y) { } void scale_gpu(const int n, const float alpha, const float* x, float* y) { - CUBLAS_CHECK(cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK( + cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK( + cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { @@ -318,12 +339,13 @@ __global__ void axpy_kernel(const int n, const float_t a, const float_t* x, } void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { - //axpy_kernel<<>>(n, a, x, y); - CUBLAS_CHECK(cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); + // axpy_kernel<<>>(n, a, x, y); + CUBLAS_CHECK( + cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); CudaTest("solving axpy kernel failed"); } -__global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { +__global__ void l2_norm_kernel(const int n, const float_t* a, float_t* sum) { CUDA_KERNEL_LOOP(i, n) { float_t product = a[i] * a[i]; atomicAdd(sum, product); @@ -332,24 +354,25 @@ __global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { acc_t l2_norm_gpu(int 
n, const float_t* x) { float_t sum = 0.0; - CUBLAS_CHECK(cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); - //float_t *d_sum; - //CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); - //CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); - //l2_norm_kernel<<>>(n, x, d_sum); - //CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), cudaMemcpyDeviceToHost)); + CUBLAS_CHECK( + cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); + // float_t *d_sum; + // CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); + // l2_norm_kernel<<>>(n, x, d_sum); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), + // cudaMemcpyDeviceToHost)); return (acc_t)sum / 2.0; } -void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out) { -} +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out) {} -void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, - float_t *in_diff, float_t *out_diff) { -} +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff) {} void copy_gpu(int len, const float_t* in, float_t* out) { - CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } // TODO: use warp @@ -362,14 +385,15 @@ __device__ void softmax_device(int n, const float_t* input, float_t* output) { for (int i = 0; i < n; i++) { output[i] = expf(input[i] - max); denominator += output[i]; - if (output[i] < 0.0) printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); - //assert(output[i] >= 0.0); + if (output[i] < 0.0) + printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); + // assert(output[i] >= 0.0); } assert(denominator != 0.0); for (int i = 0; i < n; i++) { output[i] /= denominator; - //assert(output[i] >= 0.0); - //assert(output[i] <= 1.0); + // assert(output[i] >= 0.0); + // assert(output[i] <= 1.0); } } @@ -378,18 +402,25 @@ __device__ void sigmoid_device(int n, const float_t* in, float_t* out) { out[i] = 1. / (1. 
+ expf(-in[i])); } -__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, float_t& loss) { - if (p[idx] == 0.0) loss -= logf(float_t(1e-10)); - else loss -= logf(p[idx]); +__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, + float_t& loss) { + if (p[idx] == 0.0) + loss -= logf(float_t(1e-10)); + else + loss -= logf(p[idx]); } // y: ground truth // p: predictions -__device__ void cross_entropy_multi_device(int n, const label_t *y, const float_t* p, float_t& loss) { +__device__ void cross_entropy_multi_device(int n, const label_t* y, + const float_t* p, float_t& loss) { for (int i = 0; i < n; i++) { - if (y[i] == 0) continue; - if (p[i] == float_t(0)) loss -= logf(float_t(1e-10)); // avoid NaN exception - else loss -= logf(p[i]); + if (y[i] == 0) + continue; + if (p[i] == float_t(0)) + loss -= logf(float_t(1e-10)); // avoid NaN exception + else + loss -= logf(p[i]); } } @@ -401,13 +432,13 @@ __global__ void softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, end-begin) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { // masked - // normalize using softmax - softmax_device(len, in_data + len*id, out_data + len*id); - //loss[id] = 0.0; - cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); + // normalize using softmax + softmax_device(len, in_data + len * id, out_data + len * id); + // loss[id] = 0.0; + cross_entropy_device(len, labels[id], out_data + len * id, loss[id]); } } } @@ -415,8 +446,9 @@ __global__ void softmax_cross_entropy_kernel(int len, int begin, int end, void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out) { - softmax_cross_entropy_kernel<<>>( - len, begin, end, in, masks, labels, loss, out); + softmax_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); CudaTest("solving softmax_cross_entropy kernel failed"); } @@ -428,11 +460,11 @@ __global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, end-begin) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { // masked - sigmoid_device(len, in_data + len*id, out_data + len*id); - cross_entropy_multi_device(len, labels, out_data + len*id, loss[id]); + sigmoid_device(len, in_data + len * id, out_data + len * id); + cross_entropy_multi_device(len, labels, out_data + len * id, loss[id]); } } } @@ -440,64 +472,77 @@ __global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out) { - sigmoid_cross_entropy_kernel<<>>( - len, begin, end, in, masks, labels, loss, out); + sigmoid_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); CudaTest("solving sigmoid_cross_entropy kernel failed"); } -__device__ void d_cross_entropy_device(int n, const label_t idx, const float_t* p, float_t* d) { +__device__ void d_cross_entropy_device(int n, const label_t idx, + const float_t* p, float_t* d) { for (int i = 0; i < n; i++) { - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; + if (i == (int)idx) + d[i] = -1.0 / (p[i] + 1e-10); + else + d[i] = 0.0; } } __global__ void 
d_cross_entropy_kernel(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { + const mask_t* masks, + const label_t* labels, + const float_t* data, float_t* grad) { int base = begin * len; - CUDA_KERNEL_LOOP(i, (end-begin)*len) { - int id = begin + i/len; + CUDA_KERNEL_LOOP(i, (end - begin) * len) { + int id = begin + i / len; if (masks[id] == 1) { // masked - if (i%len == (int)labels[id]) grad[i] = -1.0 / (data[i+base] + 1e-10); - else grad[i] = 0.0; - //d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); + if (i % len == (int)labels[id]) + grad[i] = -1.0 / (data[i + base] + 1e-10); + else + grad[i] = 0.0; + // d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); } } -} +} __global__ void d_cross_entropy_warp(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; - if (pid < len) p[warp_lane][pid] = data[base+pid]; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; } __syncthreads(); for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { if (pid == (int)labels[id]) - grad[wid*len+pid] = -1.0 / (p[warp_lane][pid] + 1e-10); - else grad[wid*len+pid] = 0.0; + grad[wid * len + pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else + grad[wid * len + pid] = 0.0; } } } } } -__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { +__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, + float_t* dy) { for (int i = 0; i < n; i++) { dy[i] = 0; for (int j = 0; j < n; j++) { @@ -510,47 +555,52 @@ __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, flo __global__ void d_softmax_kernel(int len, int begin, int end, const mask_t* masks, const float_t* data, const float_t* in_grad, float_t* out_grad) { - CUDA_KERNEL_LOOP(i, end-begin) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { // masked - d_softmax_device(len, data + len*id, in_grad + len*i, out_grad + len*id); + d_softmax_device(len, data + len * id, in_grad + len * i, + out_grad + len * id); } } -} - -__global__ 
void d_softmax_warp(int len, int begin, int end, - const mask_t* masks, const float_t* data, - const float_t* in_grad, float_t* out_grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; +} + +__global__ void d_softmax_warp(int len, int begin, int end, const mask_t* masks, + const float_t* data, const float_t* in_grad, + float_t* out_grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - p[warp_lane][pid] = data[base+pid]; - d[warp_lane][pid] = in_grad[wid*len+pid]; + p[warp_lane][pid] = data[base + pid]; + d[warp_lane][pid] = in_grad[wid * len + pid]; } } __syncthreads(); for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - float_t sum = 0.0; + float_t sum = 0.0; float_t self = p[warp_lane][pid]; for (int j = 0; j < len; j++) { - float_t df = (j == pid) ? self * (1.0 - self) : -p[warp_lane][j] * self; + float_t df = + (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; sum += df * d[warp_lane][j]; } - out_grad[base+pid] = sum; + out_grad[base + pid] = sum; } } __syncthreads(); @@ -559,36 +609,44 @@ __global__ void d_softmax_warp(int len, int begin, int end, } __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { - CUDA_KERNEL_LOOP(i, end-begin) { + const mask_t* masks, + const label_t* labels, + const float_t* out, + float_t* diff) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; - if (masks[id] == 1) { // masked - float_t out_grad[41]; // TODO - d_cross_entropy_device(len, labels[id], out + len*id, out_grad); - d_softmax_device(len, out + len*id, out_grad, diff + len*id); + if (masks[id] == 1) { // masked + float_t out_grad[41]; // TODO + d_cross_entropy_device(len, labels[id], out + len * id, out_grad); + d_softmax_device(len, out + len * id, out_grad, diff + len * id); } } } __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; - if (pid < len) p[warp_lane][pid] = data[base+pid]; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; } __syncthreads(); @@ -598,7 +656,8 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, if (pid < len) { if (pid == (int)labels[id]) d[warp_lane][pid] = -1.0 / (p[warp_lane][pid] + 1e-10); - else d[warp_lane][pid] = 0.0; + else + d[warp_lane][pid] = 0.0; } } __syncthreads(); @@ -607,13 +666,14 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - float_t sum = 0.0; + float_t sum = 0.0; float_t self = p[warp_lane][pid]; for (int j = 0; j < len; j++) { - float_t df = (j == pid) ? self * (1.0 - self) : -p[warp_lane][j] * self; + float_t df = + (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; sum += df * d[warp_lane][j]; } - grad[base+pid] = sum; + grad[base + pid] = sum; } } __syncthreads(); @@ -624,42 +684,51 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { -// d_softmax_cross_entropy_kernel<<>>( -// len, begin, end, masks, labels, out, diff); -// CudaTest("solving d_softmax_cross_entropy kernel failed"); - //float_t *grad; - //float_malloc_device((end-begin)*len, grad); - //d_cross_entropy_kernel<<>>( - //d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // d_softmax_cross_entropy_kernel<<>>( + // len, begin, end, masks, labels, out, diff); + // CudaTest("solving d_softmax_cross_entropy kernel failed"); + // float_t *grad; + // float_malloc_device((end-begin)*len, grad); + // d_cross_entropy_kernel<<>>( d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, + // BLOCK_SIZE>>>( // len, begin, end, masks, labels, out, grad); - //CudaTest("solving d_cross_entropy kernel failed"); - //d_softmax_kernel<<>>( - //d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // CudaTest("solving d_cross_entropy kernel failed"); + // d_softmax_kernel<<>>( + // d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( // len, begin, end, masks, out, grad, diff); - //CudaTest("solving d_softmax kernel failed"); - d_softmax_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( - len, begin, end, masks, labels, out, diff); + // CudaTest("solving d_softmax kernel failed"); + d_softmax_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); } __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; - if (pid < len) 
p[warp_lane][pid] = data[base+pid]; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; } __syncthreads(); @@ -667,9 +736,10 @@ __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - //if (p[warp_lane][pid] == 0) - d[warp_lane][pid] = -(float_t)labels[base+pid] / (p[warp_lane][pid] + 1e-10); - //else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; + // if (p[warp_lane][pid] == 0) + d[warp_lane][pid] = + -(float_t)labels[base + pid] / (p[warp_lane][pid] + 1e-10); + // else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; } } __syncthreads(); @@ -678,9 +748,9 @@ __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - float_t self = p[warp_lane][pid]; - float_t dp = d[warp_lane][pid]; - grad[base+pid] = dp * self * (float_t(1) - self); + float_t self = p[warp_lane][pid]; + float_t dp = d[warp_lane][pid]; + grad[base + pid] = dp * self * (float_t(1) - self); } } __syncthreads(); @@ -691,13 +761,15 @@ __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - d_sigmoid_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( - len, begin, end, masks, labels, out, diff); + d_sigmoid_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); CudaTest("solving d_sigmoid_cross_entropy_warp kernel failed"); } __global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, - float_t* loss, HGAccumulator total) { + float_t* loss, + HGAccumulator total) { total.thread_entry(); __shared__ cub::BlockReduce::TempStorage local_loss; CUDA_KERNEL_LOOP(i, end - begin) { @@ -707,8 +779,10 @@ __global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, total.thread_exit>(local_loss); } -//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss) { +// acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* +// loss); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss) { assert(count > 0); HGAccumulator loss_accum; Shared total_loss = Shared(1); @@ -720,4 +794,3 @@ acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* cudaDeviceSynchronize(); return *(total_loss.cpu_rd_ptr()) / count; } - diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 381539df6b..ebd19639da 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -11,19 +11,19 @@ namespace deepgalois { #ifdef GALOIS_USE_DIST void Net::dist_init(Graph* graph, std::string dataset_str) { - dGraph = graph; - context = new deepgalois::DistContext(); + dGraph = graph; + context = new deepgalois::DistContext(); num_samples = dGraph->size(); context->saveGraph(dGraph); // TODO self loop setup? context->initializeSyncSubstrate(); num_classes = context->read_labels(); - //std::cout << "Reading label masks ... "; + // std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks+num_samples, 0); - std::fill(val_masks, val_masks+num_samples, 0); + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks + num_samples, 0); + std::fill(val_masks, val_masks + num_samples, 0); if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, @@ -41,40 +41,43 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { } } } else { - train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks, dGraph); + train_count = context->read_masks("train", num_samples, train_begin, + train_end, train_masks, dGraph); + val_count = context->read_masks("val", num_samples, val_begin, val_end, + val_masks, dGraph); } feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers-1] = num_classes; // MLP embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers + 1] = + num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); } #endif #ifdef CPU_ONLY void Net::init() { - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); } // add weight decay void Net::regularize() { size_t layer_id = 0; - auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; // TODO: parallel - math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), - layers[layer_id]->get_grads_ptr()); + math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), + layers[layer_id]->get_grads_ptr()); } // Scale gradient to counterbalance accumulation -void Net::normalize() { -} +void Net::normalize() {} /** * @@ -82,7 +85,9 @@ void Net::normalize() { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { #ifndef GALOIS_USE_DIST galois::GAccumulator accuracy_all; #else @@ -93,32 +98,37 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + galois::do_all( + galois::iterate(begin, end), + [&](const auto& i) { #ifndef GALOIS_USE_DIST - if (masks == NULL || masks[i] == 1) { // use sampled graph when masks is NULL - // get prediction - auto pred = math::argmax(num_classes, preds+i*num_classes); - // check prediction - if ((label_t)pred == ground_truth[i]) - accuracy_all += 1.0; - } + if (masks == NULL || + masks[i] == 1) { // use sampled graph when 
masks is NULL + // get prediction + auto pred = math::argmax(num_classes, preds + i * num_classes); + // check prediction + if ((label_t)pred == ground_truth[i]) + accuracy_all += 1.0; + } #else - // only look at owned nodes (i.e. masters); the prediction for these - // should only be handled on the owner - if (dGraph->isOwned(i)) { - sampleCount += 1; - - uint32_t localID = dGraph->getLID(i); - if (masks[localID] == 1) { - // get prediction - auto pred = math::argmax(num_classes, &preds[localID*num_classes]); - // check prediction - if ((label_t)pred == ground_truth[localID]) - accuracy_all += 1.0; - } - } + // only look at owned nodes (i.e. masters); the prediction for these + // should only be handled on the owner + if (dGraph->isOwned(i)) { + sampleCount += 1; + + uint32_t localID = dGraph->getLID(i); + if (masks[localID] == 1) { + // get prediction + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == ground_truth[localID]) + accuracy_all += 1.0; + } + } #endif - }, galois::loopname("getMaskedLoss")); + }, + galois::loopname("getMaskedLoss")); #ifdef GALOIS_USE_DIST count = sampleCount.reduce(); @@ -129,8 +139,11 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks return accuracy_all.reduce() / (acc_t)count; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { - return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, + ground_truth, preds); } #endif diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 98a5e82010..cd635ef07f 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -7,7 +7,7 @@ // the arguments of the maxima __device__ int argmax_device(const int n, const float_t* x) { - float_t max = x[0]; + float_t max = x[0]; int max_ind = 0; for (int i = 1; i < n; i++) { if (x[i] > max) { @@ -18,15 +18,17 @@ __device__ int argmax_device(const int n, const float_t* x) { return max_ind; } -__global__ void masked_accuracy_kernel(int num_classes, int begin, - int end, mask_t* masks, - float_t* preds, label_t* labels, +__global__ void masked_accuracy_kernel(int num_classes, int begin, int end, + mask_t* masks, float_t* preds, + label_t* labels, HGAccumulator total) { total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_accuracy; + __shared__ cub::BlockReduce::TempStorage + local_accuracy; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, preds + (begin + i) * num_classes); + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); if (pred == labels[begin + i]) total.reduce(1.0); } @@ -49,13 +51,11 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, } typedef float f1count_t; -__global__ void masked_f1_score_kernel(int num_classes, int begin, - int end, mask_t* masks, - float_t* preds, label_t* labels, - f1count_t* true_positive, - f1count_t* false_positive, - f1count_t* false_negtive, - f1count_t* true_negtive) { +__global__ void +masked_f1_score_kernel(int num_classes, int begin, int end, mask_t* masks, + float_t* preds, label_t* labels, + f1count_t* true_positive, f1count_t*
false_positive, + f1count_t* false_negtive, f1count_t* true_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { @@ -83,7 +83,7 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, f1count_t* h_fp = new f1count_t[num_classes]; f1count_t* h_fn = new f1count_t[num_classes]; f1count_t* h_tn = new f1count_t[num_classes]; - f1count_t* d_tp, *d_fp, *d_fn, *d_tn; + f1count_t *d_tp, *d_fp, *d_fn, *d_tn; float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); @@ -95,41 +95,45 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, masked_f1_score_kernel<<>>( num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn, d_tn); CudaTest("solving masked_f1_score_kernel kernel failed"); - CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - - acc_t pNumerator = 0.0; - acc_t pDenominator = 0.0; - acc_t rNumerator = 0.0; - acc_t rDenominator = 0.0; + CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; acc_t precisionMacro = 0.0; - acc_t recallMacro = 0.0; + acc_t recallMacro = 0.0; for (size_t i = 0; i < num_classes; i++) { acc_t fn = (acc_t)h_fn[i]; // false negative acc_t fp = (acc_t)h_fp[i]; // false positive - acc_t tp = (acc_t)h_tp[i]; // true positive - //acc_t tn = (acc_t)h_tn[i]; // true negative + acc_t tp = (acc_t)h_tp[i]; // true positive + // acc_t tn = (acc_t)h_tn[i]; // true negative precisionMacro = precisionMacro + (tp / (tp + fp)); - recallMacro = recallMacro + (tp / (tp + fn)); - pNumerator = pNumerator + tp; - pDenominator = pDenominator + (tp + fp); - rNumerator = rNumerator + tp; - rDenominator = rDenominator + (tp + fn); + recallMacro = recallMacro + (tp / (tp + fn)); + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); } precisionMacro = precisionMacro / num_classes; - recallMacro = recallMacro / num_classes; - acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / + recallMacro = recallMacro / num_classes; + acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / ((beta * beta) * precisionMacro + recallMacro); - acc_t recallMicro = rNumerator / rDenominator; + acc_t recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / ((beta * beta) * precisionMicro + recallMicro); - std::cout << std::setprecision(3) << std::fixed << - " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; - + std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro << ",
f1_macro: " << f1_macro << ") "; + float_free_device(d_tp); float_free_device(d_fp); float_free_device(d_fn); @@ -146,7 +150,8 @@ namespace deepgalois { void Net::init() { copy_masks_device(num_samples, train_masks, d_train_masks); copy_masks_device(num_samples, val_masks, d_val_masks); - context->copy_data_to_device(); // copy labels and input features to the device + context + ->copy_data_to_device(); // copy labels and input features to the device } void Net::copy_test_masks_to_device() { @@ -156,21 +161,25 @@ void Net::copy_test_masks_to_device() { // add weight decay void Net::regularize() { size_t layer_id = 0; - auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; - axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), - layers[layer_id]->get_grads_device_ptr()); + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); } void Net::normalize() {} -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, label_t* ground_truth) { - return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, label_t* ground_truth) { - return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, ground_truth); +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); } -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index fbd8d2bc6a..e5e9fa7c10 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -10,7 +10,8 @@ void edge::alloc() { void edge::merge_grads(float_t* dst) { assert(grad_ != NULL); - if(dst) delete[] dst; + if (dst) + delete[] dst; dst = new float_t[ft_dim_]; std::copy(grad_, grad_ + ft_dim_, dst); // @todo consider adding parallelism and vectorization diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index afaceaeaea..2151162752 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -5,17 +5,20 @@ namespace deepgalois { void edge::alloc() { - CUDA_CHECK(cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } void edge::merge_grads(float_t* dst) { - CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), + cudaMemcpyDeviceToHost)); } void edge::clear_grads() { - //CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + // CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); init_const_gpu(num_samples_ * ft_dim_, 0.0, grad_); } -} +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp 
index a73b5cd6d2..e8455e9206 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -6,37 +6,46 @@ namespace deepgalois { void adagrad::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); -/* - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } -*/ + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + /* + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + */ } void RMSprop::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); } void adam::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } @@ -44,37 +53,47 @@ void adam::update(const vec_t& dW, vec_t& W) { void adamax::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); b1_t *= b1; } void gradient_descent::update(const vec_t& dW, vec_t& W) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); } void momentum::update(const 
vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, galois::loopname("momentum_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, + galois::loopname("momentum_update")); } void nesterov_momentum::update(const vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, galois::loopname("nesterov_momentum_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, + galois::loopname("nesterov_momentum_update")); } } // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 0fd16803fd..15f2fe5515 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -3,14 +3,14 @@ #include "deepgalois/math_functions.hh" __global__ void update_kernel(const int n, float_t alpha, float_t b1, - float_t b2, float_t b1_t, float_t b2_t, - float_t eps, float_t* mt, float_t* vt, - const float_t* dW, float_t* W) { + float_t b2, float_t b1_t, float_t b2_t, + float_t eps, float_t* mt, float_t* vt, + const float_t* dW, float_t* W) { CUDA_KERNEL_LOOP(i, n) { mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; - W[i] -= alpha * (mt[i] / (1.0 - b1_t)) / - sqrtf((vt[i] / (1.0 - b2_t)) + eps); + W[i] -= + alpha * (mt[i] / (1.0 - b1_t)) / sqrtf((vt[i] / (1.0 - b2_t)) + eps); } } @@ -18,7 +18,7 @@ namespace deepgalois { template template -float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { +float_t* stateful_optimizer::get_gpu(const size_t n, const float_t* key) { static_assert(Index < N, "index out of range"); if (!is_allocated_device(dE_[Index][key])) { float_malloc_device(n, dE_[Index][key]); @@ -29,9 +29,9 @@ float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { void adam::update(const vec_t& dW, vec_t& W) {} void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { - //std::cout << "updating weights on GPU, n = " << n << "\n"; - //print_device_vector(10, dW, "dW"); - float_t* cache = get_gpu<0>(n, W); + // std::cout << "updating weights on GPU, n = " << n << "\n"; + // print_device_vector(10, dW, "dW"); + float_t* cache = get_gpu<0>(n, W); float_t* velocity = get_gpu<1>(n, W); update_kernel<<>>( @@ -52,4 +52,4 @@ void momentum::update_gpu(const size_t, const float_t*, float_t*) {} void nesterov_momentum::update_gpu(const size_t, const float_t*, float_t*) {} -} +} // namespace deepgalois diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 519e27496a..29f729f3a4 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -4,8 +4,8 @@ #include #include #include -#include /* For O_RDWR */ -#include /* For open(), creat() */ +#include /* For O_RDWR */ +#include /* For open(), creat() */ #include #include @@ -27,10 +27,13 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in >> m >> num_classes >> std::ws; if (is_single_class) { 
std::cout << "Using single-class (one-hot) labels\n"; - labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + labels = + new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { std::cout << "Using multi-class labels\n"; - labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + labels = + new label_t[m * + num_classes]; // multi-class label for each vertex: N x E } unsigned v = 0; while (std::getline(in, line)) { @@ -44,7 +47,7 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { break; } } else { - labels[v*num_classes+idx] = x; + labels[v * num_classes + idx] = x; } } v++; @@ -54,14 +57,15 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { // print the number of vertex classes std::cout << "Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; + // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << + // unsigned(labels[i]) << "\n"; return num_classes; } //! Read features, return the length of a feature vector //! Features are stored in the Context class size_t Reader::read_features(float_t*& feats, std::string filetype) { - //filetype = "txt"; + // filetype = "txt"; std::cout << "Reading features ... "; Timer t_read; t_read.Start(); @@ -83,7 +87,7 @@ size_t Reader::read_features(float_t*& feats, std::string filetype) { feats = new float_t[m * feat_len]; if (filetype == "bin") { filename = path + dataset_str + "-feats.bin"; - in.open(filename, std::ios::binary|std::ios::in); + in.open(filename, std::ios::binary | std::ios::in); in.read((char*)feats, sizeof(float_t) * m * feat_len); } else { std::string line; @@ -101,15 +105,17 @@ size_t Reader::read_features(float_t*& feats, std::string filetype) { t_read.Stop(); std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 6; i ++) - //for (auto j = 0; j < 6; j ++) - //std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << "\n"; + // for (auto i = 0; i < 6; i ++) + // for (auto j = 0; j < 6; j ++) + // std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << + // "\n"; return feat_len; } //! Get masks from datafile where first line tells range of //! 
set to create mask from -size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { +size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, + size_t& end, mask_t* masks) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -142,24 +148,25 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t i++; } std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; in.close(); return sample_count; } void Reader::progressPrint(unsigned max, unsigned i) { const unsigned nsteps = 10; - unsigned ineachstep = (max / nsteps); - if(ineachstep == 0) ineachstep = 1; + unsigned ineachstep = (max / nsteps); + if (ineachstep == 0) + ineachstep = 1; if (i % ineachstep == 0) { - int progress = ((size_t) i * 100) / max + 1; + int progress = ((size_t)i * 100) / max + 1; printf("\t%3d%%\r", progress); fflush(stdout); } } -void Reader::readGraphFromGRFile(Graph *g) { +void Reader::readGraphFromGRFile(Graph* g) { std::string filename = path + dataset_str + ".csgr"; std::ifstream ifs; ifs.open(filename); @@ -175,7 +182,7 @@ void Reader::readGraphFromGRFile(Graph *g) { exit(1); } size_t masterLength = buf.st_size; - int _MAP_BASE = MAP_PRIVATE; + int _MAP_BASE = MAP_PRIVATE; void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); if (m == MAP_FAILED) { m = 0; @@ -185,18 +192,19 @@ void Reader::readGraphFromGRFile(Graph *g) { Timer t; t.Start(); - uint64_t* fptr = (uint64_t*)m; + uint64_t* fptr = (uint64_t*)m; __attribute__((unused)) uint64_t version = le64toh(*fptr++); assert(version == 1); uint64_t sizeEdgeTy = le64toh(*fptr++); - uint64_t nv = le64toh(*fptr++); - uint64_t ne = le64toh(*fptr++); - uint64_t *outIdx = fptr; + uint64_t nv = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); + uint64_t* outIdx = fptr; fptr += nv; - uint32_t *fptr32 = (uint32_t*)fptr; - uint32_t *outs = fptr32; + uint32_t* fptr32 = (uint32_t*)fptr; + uint32_t* outs = fptr32; fptr32 += ne; - if (ne % 2) fptr32 += 1; + if (ne % 2) + fptr32 += 1; if (sizeEdgeTy != 0) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); @@ -206,12 +214,13 @@ void Reader::readGraphFromGRFile(Graph *g) { auto rowptr = g->row_start_host_ptr(); for (unsigned vid = 0; vid < nv; ++vid) { g->fixEndEdge(vid, le64toh(outIdx[vid])); - auto degree = rowptr[vid+1] - rowptr[vid]; + auto degree = rowptr[vid + 1] - rowptr[vid]; for (unsigned jj = 0; jj < degree; ++jj) { unsigned eid = rowptr[vid] + jj; unsigned dst = le32toh(outs[eid]); if (dst >= nv) { - printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, eid); + printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, + eid); exit(0); } g->constructEdge(eid, dst); @@ -220,30 +229,30 @@ void Reader::readGraphFromGRFile(Graph *g) { } ifs.close(); -/* - std::string file_dims = path + dataset + "-dims.bin"; - std::string file_rowptr = path + dataset + "-rowptr.bin"; - std::string file_colidx = path + dataset + "-colidx.bin"; - index_t dims[2]; - ifs.open(file_dims, std::ios::binary|std::ios::in); - ifs.read((char*)dims, sizeof(index_t) * 2); - ifs.close(); - num_vertices_ = dims[0]; - num_edges_ = dims[1]; - degrees_ = new index_t[num_vertices_]; - rowptr_ = new 
index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; - ifs.open(file_rowptr, std::ios::binary|std::ios::in); - ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); - ifs.close(); - ifs.open(file_colidx, std::ios::binary|std::ios::in); - ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); - ifs.close(); -*/ + /* + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); + */ t.Stop(); double runtime = t.Millisecs(); - std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" - << masterLength/1000.0/runtime << " MB/s)\n\n"; + std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + << masterLength / 1000.0 / runtime << " MB/s)\n\n"; } -} +} // namespace deepgalois diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index f61f1bcaa4..0ac77526f3 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -1,113 +1,134 @@ #include "deepgalois/utils.h" #include "deepgalois/sampler.h" #include "galois/Galois.h" -#include +#include #include #define PARALLEL_GEN namespace deepgalois { -inline unsigned getDegree(Graph *g, index_t v) { - //return g->get_degree(v); - //return std::distance(g->edge_begin(v), g->edge_end(v)); +inline unsigned getDegree(Graph* g, index_t v) { + // return g->get_degree(v); + // return std::distance(g->edge_begin(v), g->edge_end(v)); return g->edge_end(v) - g->edge_begin(v); } -void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { - //galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); +void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, + mask_t* masks, Graph* g) { + // galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", + // count=", count, "\n"); begin_ = begin; - end_ = end; + end_ = end; count_ = count; masks_ = masks; - graph = g; + graph = g; #ifndef GALOIS_USE_DIST masked_graph = new Graph(); #endif - //generate_masked_graph(g->size(), masks, g, *masked_graph); + // generate_masked_graph(g->size(), masks, g, *masked_graph); std::vector degrees(g->size(), 0); get_masked_degrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); - size_t ne = offsets[g->size()]; + size_t ne = offsets[g->size()]; for (size_t i = 0; i < g->size(); i++) { - if (masks[i] == 1) node_train.push_back(i); + if (masks[i] == 1) + node_train.push_back(i); } masked_graph->allocateFrom(g->size(), ne); masked_graph->constructNodes(); - galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { - masked_graph->fixEndEdge(src, offsets[src+1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) masked_graph->constructEdge(idx++, 
dst, 0); - } - } - }, galois::loopname("gen_subgraph")); + galois::do_all( + galois::iterate((size_t)0, g->size()), + [&](const auto src) { + masked_graph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + masked_graph->constructEdge(idx++, dst, 0); + } + } + }, + galois::loopname("gen_subgraph")); masked_graph->degree_counting(); - avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + avg_deg = masked_graph->sizeEdges() / masked_graph->size(); subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - //galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); + // galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " + // num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); size_t idx = 0; vertices_.resize(count); for (size_t i = begin; i < end; i++) { - if (masks_[i] == 1) vertices_[idx++] = i; + if (masks_[i] == 1) + vertices_[idx++] = i; } } -void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { +void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, + std::vector& degrees) { assert(degrees.size() == n); #ifdef PARALLEL_GEN - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { #else for (size_t src = 0; src < n; src++) { #endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) degrees[src] ++; + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + degrees[src]++; + } + } } - } - } #ifdef PARALLEL_GEN - , galois::loopname("update_degrees")); + , + galois::loopname("update_degrees")); #endif } -void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { +void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, + Graph& sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); - //auto offsets = deepgalois::parallel_prefix_sum(degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); - size_t ne = offsets[n]; - //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); + size_t ne = offsets[n]; + // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", + // ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); #ifdef PARALLEL_GEN - galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto src) { #else for (size_t src = 0; src < n; src++) { #endif - sub.fixEndEdge(src, offsets[src+1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); + sub.fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + sub.constructEdge(idx++, dst, 0); + } + } } - } - } #ifdef PARALLEL_GEN - , 
galois::loopname("gen_subgraph")); + , + galois::loopname("gen_subgraph")); #endif #endif } -void Sampler::check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size) { +void Sampler::check_DB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, size_t size) { if (DB0.capacity() < size) { - DB0.reserve(DB0.capacity()*2); - DB1.reserve(DB1.capacity()*2); - DB2.reserve(DB2.capacity()*2); + DB0.reserve(DB0.capacity() * 2); + DB1.reserve(DB1.capacity() * 2); + DB2.reserve(DB2.capacity() * 2); } DB0.resize(size); DB1.resize(size); @@ -116,25 +137,26 @@ void Sampler::check_DB(std::vector &DB0, std::vector &DB1, std::vect void print_vertex_set(VertexSet vertex_set) { unsigned counter = 0; - unsigned n = vertex_set.size(); + unsigned n = vertex_set.size(); galois::gPrint("( "); for (int i : vertex_set) { - counter ++; - if (counter > 16 && counter < n-16) continue; + counter++; + if (counter > 16 && counter < n - 16) + continue; galois::gPrint(i, " "); } galois::gPrint(")\n"); } -void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { - //unsigned myseed = time(NULL); +void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { + // unsigned myseed = time(NULL); unsigned myseed = tid + time(NULL); - //unsigned myseed = tid; - //DBx: Dashboard line x, IAx: Index array line x + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; - DB0.reserve(subg_deg*m*ETA); - DB1.reserve(subg_deg*m*ETA); - DB2.reserve(subg_deg*m*ETA); + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); IA0.reserve(n); IA1.reserve(n); IA2.reserve(n); @@ -145,11 +167,11 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA2.resize(m); IA3.resize(m); - //galois::gPrint("seed ", myseed, " m ", m, "\n"); - //galois::gPrint("node_train size: ", node_train.size(), "\n"); - //printf("( "); - //for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; - //printf(")\n"); + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("node_train size: ", node_train.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; + // printf(")\n"); for (int i = 0; i < m; i++) { auto rand_idx = rand_r(&myseed) % node_train.size(); db_t v = IA3[i] = node_train[rand_idx]; @@ -159,61 +181,67 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA1[i] = 1; IA2[i] = 0; } - // calculate prefix sum for IA0 and store in IA2 to compute the address for each frontier in DB + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB IA2[0] = IA0[0]; - for (int i = 1; i < m; i++) IA2[i] = IA2[i-1] + IA0[i]; + for (int i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; // now fill DB accordingly - check_DB(DB0, DB1, DB2, IA2[m-1]); + check_DB(DB0, DB1, DB2, IA2[m - 1]); for (int i = 0; i < m; i++) { - db_t DB_start = (i==0) ? 0 : IA2[i-1]; - db_t DB_end = IA2[i]; + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; for (auto j = DB_start; j < DB_end; j++) { DB0[j] = IA3[i]; - DB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + DB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); DB2[j] = i + 1; } } db_t choose, neigh_v, newsize, tmp; - for (size_t itr = 0; itr < n-m; itr++) { + for (size_t itr = 0; itr < n - m; itr++) { choose = db_t(-1); while (choose == db_t(-1)) { tmp = rand_r(&myseed) % DB0.size(); if (size_t(tmp) < DB0.size()) - if (DB0[tmp] != db_t(-1)) choose = tmp; + if (DB0[tmp] != db_t(-1)) + choose = tmp; } - choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); - db_t v = DB0[choose]; + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; auto degree = getDegree(masked_graph, v); - neigh_v = (degree!=0) ? rand_r(&myseed)%degree : db_t(-1); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v)+neigh_v); + neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v) + neigh_v); st.insert(neigh_v); - IA1[DB2[choose]-1] = 0; - IA0[DB2[choose]-1] = 0; - for (auto i = choose; i < choose-DB1[choose]; i++) DB0[i] = db_t(-1); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); newsize = getDegree(masked_graph, neigh_v); newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; - } - else newsize = 0; - //shrink DB to remove sampled nodes, also shrink IA accordingly + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly bool cond = DB0.size() + newsize > DB0.capacity(); if (cond) { // compute prefix sum for the location in shrinked DB IA4.resize(IA0.size()); - IA4[0]=IA0[0]; - for (size_t i = 1; i < IA0.size(); i++) IA4[i] = IA4[i-1] + IA0[i]; + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; nDB0.resize(IA4.back()); nDB1.resize(IA4.back()); nDB2.resize(IA4.back()); IA2.assign(IA4.begin(), IA4.end()); for (size_t i = 0; i < IA0.size(); i++) { - if (IA1[i] == 0) continue; - db_t DB_start = (i==0) ? 0 : IA4[i-1]; - db_t DB_end = IA4[i]; + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 0 : IA4[i - 1]; + db_t DB_end = IA4[i]; for (auto j = DB_start; j < DB_end; j++) { nDB0[j] = IA3[i]; - nDB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + nDB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); nDB2[j] = i + 1; } } @@ -221,18 +249,19 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA4.resize(IA1.size()); IA4[0] = IA1[0]; for (size_t i = 1; i < IA1.size(); i++) - IA4[i] = IA4[i-1] + IA1[i]; + IA4[i] = IA4[i - 1] + IA1[i]; DB0.assign(nDB0.begin(), nDB0.end()); DB1.assign(nDB1.begin(), nDB1.end()); DB2.assign(nDB2.begin(), nDB2.end()); - for (auto i = DB2.begin(); i < DB2.end(); i++) *i = IA4[*i - 1]; - db_t curr=0; + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; for (size_t i = 0; i < IA0.size(); i++) { if (IA0[i] != 0) { - IA0[curr]=IA0[i]; - IA1[curr]=IA1[i]; - IA2[curr]=IA2[i]; - IA3[curr]=IA3[i]; + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; curr++; } } @@ -241,21 +270,21 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA2.resize(curr); IA3.resize(curr); } - check_DB(DB0, DB1, DB2, newsize+DB0.size()); + check_DB(DB0, DB1, DB2, newsize + DB0.size()); IA0.push_back(newsize); IA1.push_back(1); IA2.push_back(IA2.back() + IA0.back()); IA3.push_back(neigh_v); db_t DB_start = (*(IA2.end() - 2)); - db_t DB_end = IA2.back(); + db_t DB_end = IA2.back(); for (auto j = DB_start; j < DB_end; j++) { DB0[j] = IA3.back(); - DB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); DB2[j] = IA3.size(); } } - //galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); - //print_vertex_set(st); + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); } // !API function for user-defined selection strategy @@ -263,48 +292,53 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
-void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList vertices, VertexSet &vertex_set) { - //galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set) { + // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, + // graph size: ", g->size(), "\n"); assert(nv == vertices.size()); - auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items( + m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); for (int i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); - //galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); - int *degrees = new int[m]; + // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); + int* degrees = new int[m]; for (int i = 0; i < m; i++) { - //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + // galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { degrees[i] = (int)getDegree(g, frontier[i]); - }//, galois::loopname("compute_degrees")); + } //, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { - auto pos = select_one_item((int)m, degrees); - auto u = frontier[pos]; + auto pos = select_one_item((int)m, degrees); + auto u = frontier[pos]; auto degree = degrees[pos]; - int j =0; - for (; j < degree; j ++) { + int j = 0; + for (; j < degree; j++) { auto neighbor_id = rand() % degree; // randomly select a neighbor - auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); + auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); if (vertex_set.find(dst) == vertex_set.end()) { frontier[pos] = dst; - degrees[pos] = getDegree(g, frontier[pos]); + degrees[pos] = getDegree(g, frontier[pos]); vertex_set.insert(dst); break; } } - if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); + if (j == degree) + galois::gPrint("Not found from ", degree, " neighbors\n"); } /* - assert(n == vertex_set.size()); // size of vertex_set could be slightly smaller than n - galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: "); - print_vertex_set(vertex_set); + assert(n == vertex_set.size()); // size of vertex_set could be slightly + smaller than n galois::gPrint("Done selection, vertex_set size: ", + vertex_set.size(), ", set: "); print_vertex_set(vertex_set); */ } -void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { - //galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); - std::fill(masks, masks+n, 0); - for (auto v : vertices) masks[v] = 1; +void Sampler::update_masks(size_t n, VertexSet vertices, mask_t* masks) { + // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); + std::fill(masks, masks + n, 0); + for (auto v : vertices) + masks[v] = 1; } inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { @@ -316,55 +350,64 @@ inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { return new_ids; } -// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g -void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { - //auto n = g.size(); // old graph size - auto nv = vertex_set.size(); // new 
graph (subgraph) size +// Given a subset of vertices and a graph g, generate a subgraph sg from the +// graph g +void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { + // auto n = g.size(); // old graph size + auto nv = vertex_set.size(); // new graph (subgraph) size VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { - degrees[new_ids[v]] = getDegree(&g, v); + degrees[new_ids[v]] = getDegree(&g, v); } - //auto offsets = deepgalois::parallel_prefix_sum(degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); - auto ne = offsets[nv]; - //galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); + auto ne = offsets[nv]; + // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, + // "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + galois::do_all( + galois::iterate((size_t)0, nv), + [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif - sub.fixEndEdge(i, offsets[i+1]); - unsigned j = 0; - auto old_id = old_ids[i]; - for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { - auto dst = new_ids[g.getEdgeDst(e)]; - assert(dst < nv); - sub.constructEdge(offsets[i]+j, dst, 0); - j ++; - } - } + sub.fixEndEdge(i, offsets[i + 1]); + unsigned j = 0; + auto old_id = old_ids[i]; + for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { + auto dst = new_ids[g.getEdgeDst(e)]; + assert(dst < nv); + sub.constructEdge(offsets[i] + j, dst, 0); + j++; + } + } #ifdef PARALLEL_GEN - , galois::loopname("construct_graph")); + , + galois::loopname("construct_graph")); #endif #endif } -void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks, unsigned tid) { +void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, + unsigned tid) { VertexSet vertex_set; // n = 9000 by default - //select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + // select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = + // 1000 by default select_vertices(n, m_, vertex_set, tid); // m = 1000 by default - update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set + update_masks(graph->size(), vertex_set, + masks); // set masks for vertices in the vertex_set #ifndef GALOIS_USE_DIST Graph masked_sg; - generate_masked_graph(graph->size(), masks, masked_graph, masked_sg); // remove edges whose destination is not masked + generate_masked_graph( + graph->size(), masks, masked_graph, + masked_sg); // remove edges whose destination is not masked generate_subgraph(vertex_set, masked_sg, sg); #endif } -} // end namespace - +} // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 00a7d5696a..3f67974c67 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -5,60 +5,66 @@ namespace deepgalois { // parallel prefix sum template -OutTy* parallel_prefix_sum(const std::vector &in) { - const size_t block_size = 1<<20; +OutTy* parallel_prefix_sum(const std::vector& in) { + const size_t block_size = 1 << 20; const size_t num_blocks = (in.size() + block_size - 1) / block_size; std::vector local_sums(num_blocks); // 
count how many bits are set on each thread - galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { - OutTy lsum = 0; - size_t block_end = std::min((block + 1) * block_size, in.size()); - for (size_t i=block * block_size; i < block_end; i++) - lsum += in[i]; - local_sums[block] = lsum; - }); - std::vector bulk_prefix(num_blocks+1); + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy lsum = 0; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) + lsum += in[i]; + local_sums[block] = lsum; + }); + std::vector bulk_prefix(num_blocks + 1); OutTy total = 0; - for (size_t block=0; block < num_blocks; block++) { + for (size_t block = 0; block < num_blocks; block++) { bulk_prefix[block] = total; total += local_sums[block]; } bulk_prefix[num_blocks] = total; - OutTy *prefix = new OutTy[in.size() + 1]; - galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { - OutTy local_total = bulk_prefix[block]; - size_t block_end = std::min((block + 1) * block_size, in.size()); - for (size_t i=block * block_size; i < block_end; i++) { - prefix[i] = local_total; - local_total += in[i]; - } - }); + OutTy* prefix = new OutTy[in.size() + 1]; + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy local_total = bulk_prefix[block]; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) { + prefix[i] = local_total; + local_total += in[i]; + } + }); prefix[in.size()] = bulk_prefix[num_blocks]; return prefix; } -template uint32_t* parallel_prefix_sum(const std::vector &in); +template uint32_t* +parallel_prefix_sum(const std::vector& in); // Compute the F1 score, also known as balanced F-score or F-measure -// The F1 score can be interpreted as a weighted average of the precision and recall, -// where an F1 score reaches its best value at 1 and worst score at 0. +// The F1 score can be interpreted as a weighted average of the precision and +// recall, where an F1 score reaches its best value at 1 and worst score at 0. // The relative contribution of precision and recall to the F1 score are equal. // The formula for the F1 score is: // F1 = 2 * (precision * recall) / (precision + recall) // where precision = TP / (TP + FP), recall = TP / (TP + FN) // TP: true positive; FP: false positive; FN: false negative. -// In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. -// Please refer to https://sebastianraschka.com/faq/docs/multiclass-metric.html, -// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf (p.1672) -// and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp -acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t *masks, - size_t num_classes, label_t *ground_truth, float_t *pred) { +// In the multi-class and multi-label case, this is the weighted average of the +// F1 score of each class. 
Please refer to +// https://sebastianraschka.com/faq/docs/multiclass-metric.html, +// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf +// (p.1672) and +// https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp +acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, + size_t num_classes, label_t* ground_truth, + float_t* pred) { double precision_cls(0.), recall_cls(0.), f1_accum(0.); int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); for (size_t col = 0; col < num_classes; col++) { int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); - for (size_t row = begin; row < end; row ++) { - //galois::do_all(galois::iterate(begin, end), [&](const auto& row) { + for (size_t row = begin; row < end; row++) { + // galois::do_all(galois::iterate(begin, end), [&](const auto& row) { if (masks == NULL || masks[row] == 1) { auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { @@ -81,18 +87,31 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t *masks, fn_accum += fn_cls; fp_accum += fp_cls; tn_accum += tn_cls; - precision_cls = tp_cls + fp_cls > 0 ? (double)tp_cls/(double)(tp_cls+fp_cls) : 0.; - recall_cls = tp_cls+fn_cls > 0 ? (double)tp_cls/(double)(tp_cls+fn_cls) : 0.; - f1_accum += recall_cls+precision_cls > 0. ? 2.*(recall_cls*precision_cls)/(recall_cls+precision_cls) : 0.; + precision_cls = + tp_cls + fp_cls > 0 ? (double)tp_cls / (double)(tp_cls + fp_cls) : 0.; + recall_cls = + tp_cls + fn_cls > 0 ? (double)tp_cls / (double)(tp_cls + fn_cls) : 0.; + f1_accum += + recall_cls + precision_cls > 0. + ? 2. * (recall_cls * precision_cls) / (recall_cls + precision_cls) + : 0.; } - double f1_macro = f1_accum/(double)num_classes; - //double accuracy_mic = (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); - double precision_mic = tp_accum+fp_accum > 0 ? (double)tp_accum/(double)(tp_accum+fp_accum) : 0.; - double recall_mic = tp_accum+fn_accum > 0 ? (double)tp_accum/(double)(tp_accum+fn_accum) : 0.; - double f1_micro = recall_mic+precision_mic > 0. ? 2.*(recall_mic*precision_mic)/(recall_mic+precision_mic) : 0.; - std::cout << std::setprecision(3) << std::fixed << - " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; + double f1_macro = f1_accum / (double)num_classes; + // double accuracy_mic = + // (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); + double precision_mic = tp_accum + fp_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fp_accum) + : 0.; + double recall_mic = tp_accum + fn_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fn_accum) + : 0.; + double f1_micro = + recall_mic + precision_mic > 0. + ? 2. 
* (recall_mic * precision_mic) / (recall_mic + precision_mic) + : 0.; + std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro + << ", f1_macro: " << f1_macro << ") "; return f1_micro; } -} // end namespace +} // namespace deepgalois diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index 97e1d71447..f2d08d3cb3 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -14,14 +14,14 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); // the neural network to train: loads the entire graph on CPU - deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, - hidden1, learning_rate, dropout_rate, weight_decay, - add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, val_interval); + deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, + learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, val_interval); std::vector dummyVec; deepgalois::Graph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); + galois::graphs::constructSymmetricGraph(dummyVec); network.dist_init(dGraph, dataset); // read network, features, ground truth, initialize metadata @@ -30,7 +30,7 @@ int main(int argc, char** argv) { network.print_layers_info(); deepgalois::ResourceManager rm; // tracks peak memory usage - // the optimizer used to update parameters, + // the optimizer used to update parameters, // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); From 9d254bbe603928fec343dcc6661f89b3afbf9d72 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 8 May 2020 18:03:32 -0500 Subject: [PATCH 263/660] add sampler.cu --- libdeepgalois/src/sampler.cu | 145 +++++++++++++++++++++++++++++++++++ libgpu/include/graph_gpu.h | 17 +++- libgpu/src/csr_graph.cu | 9 +++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 libdeepgalois/src/sampler.cu diff --git a/libdeepgalois/src/sampler.cu b/libdeepgalois/src/sampler.cu new file mode 100644 index 0000000000..cecfa6c9e0 --- /dev/null +++ b/libdeepgalois/src/sampler.cu @@ -0,0 +1,145 @@ +#include +#include +#include "deepgalois/sampler.h" + +namespace deepgalois { + +// set the masks of vertices in a given vertex set +// n is the size of the vertex set +__global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[vertices[i]] = 1; } +} + +// compute the degrees of a masked graph +// n is the size of the original graph +__global__ void get_masked_degrees(index_t n, mask_t *masks, GraphGPU g, index_t* degrees) { + CUDA_KERNEL_LOOP(src, n) { + if (masks[src] == 1) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) degrees[src] ++; + } + } + } +} + +// Given a graph, remove any edge which has end-point masked, and generate the subgraph +// n is the size of the original graph and the subgraph +// offset was computed by using prefix-sum of the masked degrees +__global__ void generate_masked_graph_kernel(index_t n, const mask_t *masks, const index_t* offsets, GraphGPU g, GraphGPU subg) { + CUDA_KERNEL_LOOP(src, n) { + subg.fixEndEdge(src, offsets[src+1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) subg.constructEdge(idx++, dst); + 
}
+      }
+    }
+  }
+}
+
+// compute the degrees of the subgraph induced by the vertex set
+// n is the size of the vertex set
+// new_ids array maps vertex ID in the original graph to the vertex ID in the subgraph
+__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, GraphGPU g, index_t* degrees) {
+  CUDA_KERNEL_LOOP(i, n) {
+    auto v = vertices[i];
+    degrees[new_ids[v]] = g.getOutDegree(v);
+  }
+}
+
+// Given a masked graph, keep only the masked (sampled) vertices, reindex them, and generate the subgraph
+// offset was computed by using prefix-sum of the new degrees
+// n is the size of the old_ids and the subgraph
+__global__ void generate_graph_kernel(index_t n, const index_t* offsets, const index_t* old_ids, const index_t* new_ids, GraphGPU g, GraphGPU subg) {
+  CUDA_KERNEL_LOOP(i, n) {
+    subg.fixEndEdge(i, offsets[i+1]);
+    index_t j = 0;
+    auto src = old_ids[i];
+    for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) {
+      auto dst = new_ids[g.getEdgeDst(e)];
+      assert(dst < n);
+      subg.constructEdge(offsets[i] + j, dst);
+      j++;
+    }
+  }
+}
+
+void Sampler::update_masks(size_t n, index_t* vertices, mask_t *masks) {
+  // launch configuration assumed to follow the CUDA_GET_BLOCKS/CUDA_NUM_THREADS
+  // convention used by the other deepgalois kernels
+  set_masks<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS>>>(n, vertices, masks);
+}
+
+void Sampler::indexing(size_t n, index_t* vertices, index_t *new_indices) {
+  index_t vid = 0;
+  for (index_t i = 0; i < n; i++) {
+    auto v = vertices[i];
+    new_indices[v] = vid ++;
+  }
+}
+
+inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) {
+  VertexList new_ids(n, 0);
+  int vid = 0;
+  for (auto v : vertex_set) {
+    new_ids[v] = vid++; // reindex
+  }
+  return new_ids;
+}
+
+void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU *g, GraphGPU *subg) {
+  index_t *degrees, *offsets;
+  CUDA_CHECK(cudaMalloc((void**)&degrees, sizeof(index_t)*n));
+  get_masked_degrees<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS>>>(n, masks, *g, degrees);
+  CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1)));
+  // inclusive scan into offsets+1 (with offsets[0] = 0) so that offsets[n] holds the total edge count
+  CUDA_CHECK(cudaMemset(offsets, 0, sizeof(index_t)));
+  thrust::inclusive_scan(thrust::device, degrees, degrees+n, offsets+1);
+  CUDA_CHECK(cudaFree(degrees)); // free only after the scan has consumed it
+  index_t ne;
+  CUDA_CHECK(cudaMemcpy(&ne, offsets+n, sizeof(index_t), cudaMemcpyDeviceToHost));
+  subg->allocateFrom(n, ne); // TODO: avoid reallocation
+  generate_masked_graph_kernel<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS>>>(n, masks, offsets, *g, *subg);
+  CUDA_CHECK(cudaFree(offsets));
+}
+
+// use a random walk to select vertex subset
+void Sampler::select_vertices(size_t n, int m, VertexSet &st) {
+}
+
+// n: size of the original graph
+// nv: size of the subgraph; i.e. size of vertex_set
+// masks, graph g and subgraph sub are on the device (GPU)
+void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, GraphGPU *g, GraphGPU *sub) {
+  // convert the vertex_set to a vertex_list and copy it to the device
+  VertexList vertex_list(vertex_set.begin(), vertex_set.end());
+  index_t *d_vertex_list;
+  cudaMalloc((void **) &d_vertex_list, nv*sizeof(index_t));
+  CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv*sizeof(index_t), cudaMemcpyHostToDevice));
+
+  index_t n = graph->size();
+  update_masks(nv, d_vertex_list, masks); // set masks for the nv vertices in the vertex_set
+  GraphGPU masked_sg; // size is the same as original graph, but masked dst removed
+  generate_masked_graph(n, masks, g, &masked_sg); // remove edges whose destination is not masked
+
+  // re-index the subgraph
+  index_t *d_new_ids; // Given an old vertex ID ∈ [0, n), returns a new vertex ID ∈ [0, nv)
+  cudaMalloc((void **) &d_new_ids, n*sizeof(index_t));
+  auto new_ids = reindexing_vertices(n, vertex_set); // the map is indexed by old (original-graph) IDs
+  CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n*sizeof(index_t), cudaMemcpyHostToDevice));
+
+  // generate the offsets for the re-indexed subgraph
+  index_t *degrees, *offsets;
+  CUDA_CHECK(cudaMalloc((void**)&degrees, sizeof(index_t)*nv));
+  get_new_degrees<<<CUDA_GET_BLOCKS(nv), CUDA_NUM_THREADS>>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees);
+  CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1)));
+  CUDA_CHECK(cudaMemset(offsets, 0, sizeof(index_t)));
+  thrust::inclusive_scan(thrust::device, degrees, degrees+nv, offsets+1);
+  CUDA_CHECK(cudaFree(degrees)); // free only after the scan has consumed it
+  index_t ne;
+  CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost));
+
+  // allocate memory for the subgraph
+  sub->allocateFrom(nv, ne); // avoid reallocation
+  // generate the subgraph
+  generate_graph_kernel<<<CUDA_GET_BLOCKS(nv), CUDA_NUM_THREADS>>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub);
+}
+
+}
diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h
index b47ed326b1..449e38a7b5 100644
--- a/libgpu/include/graph_gpu.h
+++ b/libgpu/include/graph_gpu.h
@@ -167,7 +167,22 @@ struct CSRGraph {
   CUDA_HOSTDEV const node_data_type *node_data_ptr() const { return node_data; }
   CUDA_HOSTDEV edge_data_type *edge_data_ptr() { return edge_data; }
   CUDA_HOSTDEV const edge_data_type *edge_data_ptr() const { return edge_data; }
-
+  CUDA_HOSTDEV void fixEndEdge(index_type vid, index_type row_end) { row_start[vid + 1] = row_end; }
+  CUDA_HOSTDEV void constructEdge(index_type eid, index_type dst, edge_data_type edata = 0) {
+    assert(dst < nnodes);
+    assert(eid < nedges);
+    edge_dst[eid] = dst;
+    //if (edge_data) edge_data[eid] = edata;
+  }
+  // pass the pointer by reference so the device allocation is returned to the caller
+  void malloc_index_device(index_type n, index_type*& ptr);
+  void set_index(index_type pos, index_type value, index_type *ptr);
+  void allocateFrom(index_type nv, index_type ne) {
+    nnodes = nv;
+    nedges = ne;
+    malloc_index_device(nedges, edge_dst);
+    malloc_index_device(nnodes+1, row_start);
+    set_index(0, 0, row_start);
+  }
   size_t size() { return size_t(nnodes); }
   size_t sizeEdges() { return size_t(nedges); }
   void degree_counting() {}
diff --git a/libgpu/src/csr_graph.cu b/libgpu/src/csr_graph.cu
index 593451d788..e7be218138 100644
--- a/libgpu/src/csr_graph.cu
+++ b/libgpu/src/csr_graph.cu
@@ -46,6 +46,15 @@ unsigned CSRGraph::allocOnHost(bool no_edge_data) {
   return ((no_edge_data || edge_data) && row_start && edge_dst && node_data);
 }
 
+void CSRGraph::malloc_index_device(index_type n, index_type*& ptr) {
+  check_cuda(cudaMalloc((void **) &ptr, n * sizeof(index_type)));
+}
+
+void CSRGraph::set_index(index_type pos, index_type value, index_type *ptr)
{ + index_type h_value = value; + check_cuda(cudaMemcpy(ptr+pos, &h_value, sizeof(index_type), cudaMemcpyHostToDevice)); +} + unsigned CSRGraph::allocOnDevice(bool no_edge_data) { if(edge_dst != NULL) // already allocated return true; From e01dbc54d627a2fda134d16fa081a1b4cd1be766 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 10 May 2020 19:16:17 -0500 Subject: [PATCH 264/660] =?UTF-8?q?fix=20error:=20=E2=80=98CSRGraph?= =?UTF-8?q?=E2=80=99=20does=20not=20name=20a=20type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libdeepgalois/include/deepgalois/gtypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index ff4a6e4e46..a2535f93a3 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,5 +1,4 @@ #pragma once -#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -11,6 +10,7 @@ //#include "galois/graphs/LCGraph.h" #include "deepgalois/lgraph.h" #else +#define USE_CSRGRAPH #ifdef USE_CSRGRAPH #include "deepgalois/lgraph.h" #include "graph_gpu.h" From a83c74665ace438497a60d481b57374a5be5467b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 11 May 2020 09:18:30 -0500 Subject: [PATCH 265/660] add fat and sage --- lonestar/gnn/CMakeLists.txt | 2 ++ lonestar/gnn/gat/CMakeLists.txt | 14 +++++++++ lonestar/gnn/gat/gat.cpp | 16 ++++++++++ lonestar/gnn/graphsage/gs-mean.cpp | 45 --------------------------- lonestar/gnn/include/engine.h | 50 ++++++++++++++++++++++++++++++ lonestar/gnn/sage/CMakeLists.txt | 14 +++++++++ lonestar/gnn/sage/sage.cpp | 20 ++++++++++++ 7 files changed, 116 insertions(+), 45 deletions(-) create mode 100644 lonestar/gnn/gat/CMakeLists.txt create mode 100644 lonestar/gnn/gat/gat.cpp delete mode 100644 lonestar/gnn/graphsage/gs-mean.cpp create mode 100644 lonestar/gnn/include/engine.h create mode 100644 lonestar/gnn/sage/CMakeLists.txt create mode 100644 lonestar/gnn/sage/sage.cpp diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 1f5d35b5f1..773df6a819 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -24,3 +24,5 @@ if(ENABLE_DIST_GALOIS) endif() add_subdirectory(gcn) +add_subdirectory(sage) +add_subdirectory(gat) diff --git a/lonestar/gnn/gat/CMakeLists.txt b/lonestar/gnn/gat/CMakeLists.txt new file mode 100644 index 0000000000..f9f1efdc6f --- /dev/null +++ b/lonestar/gnn/gat/CMakeLists.txt @@ -0,0 +1,14 @@ +add_executable(gat gat.cpp) +target_link_libraries(gat PRIVATE Galois::shmem lonestar) + +if(ENABLE_HETERO_GALOIS) + set_property(TARGET gat PROPERTY CUDA_STANDARD 14) + set_property(TARGET gat PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(gat PRIVATE dg_gpu dg_cpu) + target_link_libraries(gat PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) +else() +target_link_libraries(gat PRIVATE dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(gat PRIVATE distgraphloader) +endif() +endif() diff --git a/lonestar/gnn/gat/gat.cpp b/lonestar/gnn/gat/gat.cpp new file mode 100644 index 0000000000..6f652e84c7 --- /dev/null +++ b/lonestar/gnn/gat/gat.cpp @@ -0,0 +1,16 @@ +// Graph Attension Networks (GAT) +// Xuhao Chen +#include "lonestargnn.h" + +const char* name = "Graph Attention Networks (GAT)"; +const char* desc = "Graph Attention Networks on an undirected graph: "; +const char* url = 0; + +// define aggregator here + +// math: h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} 
\alpha_{i,j} W^{(l)} h_j^{(l)} +// where :math:`\alpha_{ij}` is the attention score bewteen node :math:`i` and node :math:`j`: +// .. math:: \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l}) +// e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right) + +#include "engine.h" diff --git a/lonestar/gnn/graphsage/gs-mean.cpp b/lonestar/gnn/graphsage/gs-mean.cpp deleted file mode 100644 index 4bd80e6203..0000000000 --- a/lonestar/gnn/graphsage/gs-mean.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Graph Neural Networks -// Xuhao Chen -#include "gnn.h" - -const char* name = "GraphSage"; -const char* desc = "A graph neural network variant: GraphSAGE"; -const char* url = 0; - -class GraphSageMean : public graph_conv_layer { - // user-defined combine function -}; - -int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); // default setting for now; see its implementation to find how - // to customize it by the user - ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details - // optimizer *opt = new gradient_descent(); - // optimizer *opt = new adagrad(); - optimizer* opt = new adam(); - galois::StatTimer Ttrain("Train"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); - - // test using test samples - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 - // TODO: replace ad-hoc settings - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = - network.evaluate(test_begin, test_end, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss - << " test_acc = " << test_acc << " test_time = " << test_time - << "\n"; - Ttest.stop(); - - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; -} diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h new file mode 100644 index 0000000000..7d0691de0f --- /dev/null +++ b/lonestar/gnn/include/engine.h @@ -0,0 +1,50 @@ +#ifdef GALOIS_USE_DIST +#include "DistributedGraphLoader.h" +#endif + +int main(int argc, char** argv) { + galois::DistMemSys G; + LonestarGnnStart(argc, argv, name, desc, url); + + // the neural network to train: loads the entire graph on CPU + deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, + learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, val_interval); + + std::vector dummyVec; + deepgalois::Graph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); + network.dist_init(dGraph, dataset); + + // read network, features, ground truth, initialize metadata + // default setting for now; can be customized by the user + network.construct_layers(); + network.print_layers_info(); + deepgalois::ResourceManager rm; // tracks peak memory usage + + // the optimizer used to update parameters, + // see optimizer.h for more details + // optimizer *opt = new gradient_descent(); + // optimizer *opt = new adagrad(); + deepgalois::optimizer* opt = new deepgalois::adam(); + galois::StatTimer Ttrain("TrainAndVal"); + Ttrain.start(); + network.train(opt, do_validate); // do training using training samples + Ttrain.stop(); + + if (do_test) { + // test using test samples + galois::gPrint("\n"); + network.read_test_masks(dataset); + galois::StatTimer Ttest("Test"); + Ttest.start(); + acc_t 
test_loss = 0.0, test_acc = 0.0; + double test_time = network.evaluate("test", test_loss, test_acc); + galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, + " test_time = ", test_time, "\n"); + Ttest.stop(); + } + galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); + return 0; +} diff --git a/lonestar/gnn/sage/CMakeLists.txt b/lonestar/gnn/sage/CMakeLists.txt new file mode 100644 index 0000000000..94b6d234b7 --- /dev/null +++ b/lonestar/gnn/sage/CMakeLists.txt @@ -0,0 +1,14 @@ +add_executable(sage sage.cpp) +target_link_libraries(sage PRIVATE Galois::shmem lonestar) + +if(ENABLE_HETERO_GALOIS) + set_property(TARGET sage PROPERTY CUDA_STANDARD 14) + set_property(TARGET sage PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(sage PRIVATE dg_gpu dg_cpu) + target_link_libraries(sage PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) +else() +target_link_libraries(sage PRIVATE dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(sage PRIVATE distgraphloader) +endif() +endif() diff --git a/lonestar/gnn/sage/sage.cpp b/lonestar/gnn/sage/sage.cpp new file mode 100644 index 0000000000..a6f6b8621e --- /dev/null +++ b/lonestar/gnn/sage/sage.cpp @@ -0,0 +1,20 @@ +// GraphSAGE +// Xuhao Chen +#include "lonestargnn.h" + +const char* name = "GraphSAGE"; +const char* desc = "GraphSAGE on an undirected graph: "; +const char* url = 0; + +// define aggregator here +// .. math:: +// h_{\mathcal{N}(i)}^{(l+1)} & = \mathrm{aggregate} +// \left(\{h_{j}^{l}, \forall j \in \mathcal{N}(i) \}\right) +// +// h_{i}^{(l+1)} & = \sigma \left(W \cdot \mathrm{concat} +// (h_{i}^{l}, h_{\mathcal{N}(i)}^{l+1} + b) \right) +// +// h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{l}) + + +#include "engine.h" From b5f22a77c6b7ac3c87d0387966f7a8db5d326f13 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 11 May 2020 11:16:31 -0500 Subject: [PATCH 266/660] update gat and sage --- libdeepgalois/include/deepgalois/types.h | 1 + lonestar/gnn/CMakeLists.txt | 2 +- lonestar/gnn/gat/gat.cpp | 22 ++++++++++++-- lonestar/gnn/sage/sage.cpp | 37 +++++++++++++++++++++++- 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 71add8b650..e3165abc8a 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -31,6 +31,7 @@ typedef std::vector dims_t; // dimentions type typedef uint32_t index_t; // index type typedef float_t edata_t; // edge data type typedef float_t vdata_t; // vertex data type +typedef float_t* emb_t; // embedding (feature vector) type enum class net_phase { train, test }; diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 773df6a819..40eac53052 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -24,5 +24,5 @@ if(ENABLE_DIST_GALOIS) endif() add_subdirectory(gcn) -add_subdirectory(sage) +#add_subdirectory(sage) add_subdirectory(gat) diff --git a/lonestar/gnn/gat/gat.cpp b/lonestar/gnn/gat/gat.cpp index 6f652e84c7..10647924b7 100644 --- a/lonestar/gnn/gat/gat.cpp +++ b/lonestar/gnn/gat/gat.cpp @@ -6,11 +6,29 @@ const char* name = "Graph Attention Networks (GAT)"; const char* desc = "Graph Attention Networks on an undirected graph: "; const char* url = 0; -// define aggregator here - // math: h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} \alpha_{i,j} W^{(l)} h_j^{(l)} // where :math:`\alpha_{ij}` is the attention score bewteen node :math:`i` and node :math:`j`: // .. 
math:: \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l}) // e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right) +/* +namespace deepgalois { + +// define aggregator here +class AppAggregator: public Aggregator { +public: + emb_t applyEdge(VertexID, VertexID u, emb_t in) { + auto ilen = get_in_feat_len(); + return &in[ilen*u]; + } + + emb_t applyVertex(VertexID v, emb_t in, emb_t accum) { + auto n = get_num_samples(); + auto ilen = get_in_feat_len(); + auto olen = get_out_feat_len(); + emb_t a, b, c; + } +}; +} +//*/ #include "engine.h" diff --git a/lonestar/gnn/sage/sage.cpp b/lonestar/gnn/sage/sage.cpp index a6f6b8621e..5f078dff63 100644 --- a/lonestar/gnn/sage/sage.cpp +++ b/lonestar/gnn/sage/sage.cpp @@ -1,4 +1,4 @@ -// GraphSAGE +// GraphSAGE: // Xuhao Chen #include "lonestargnn.h" @@ -16,5 +16,40 @@ const char* url = 0; // // h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{l}) +namespace deepgalois { + +class AppAggregator: public Aggregator { +public: + emb_t applyEdge(VertexID, VertexID u, emb_t in) { + auto ilen = get_in_feat_len(); + return &in[ilen*u]; + } + emb_t applyVertex(VertexID v, emb_t in, emb_t accum) { + auto n = get_num_samples(); + auto ilen = get_in_feat_len(); + auto olen = get_out_feat_len(); + emb_t a, b, c; + math::mvmul(CblasTrans, olen, ilen, 1.0, W, &accum[v*ilen], 0.0, a); // a = W * accum[v]; [olen x ilen] * [ilen x 1] = [olen x 1] + math::mvmul(CblasTrans, olen, ilen, 1.0, Q, &in[v*ilen], 0.0, b); // b = Q * in; [olen x ilen] * [ilen x 1] = [olen x 1] + math::vadd_cpu(olen, a, b, c); // c = a + b; [olen x 1] + return c; // the feature vector to update h[v] + } +/* + emb_t applyVertex(emb_t in, emb_t accum) { + auto n = get_num_samples(); + auto ilen = get_in_feat_len(); + auto olen = get_out_feat_len(); + emb_t a, b, c; + math::matmul(n, olen, ilen, accum, W, a); // a = accum * W; [n x ilen] * [ilen x olen] = [n x olen] + math::matmul(n, olen, ilen, in, Q, b); // b = in * Q; [n x ilen] * [ilen x olen] = [n x olen] + math::vadd(n*olen, a, b, c); // c = a + b; [n x olen] + return c; // all the feature vectors to update the entire h + } +*/ + //void update_all(size_t len, Graph& g, const emb_t in, emb_t out) { + //} +}; + +} #include "engine.h" From 15f9aa6a514423e8f4d63326e7eb1daba0c3b0ac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 15:25:30 -0500 Subject: [PATCH 267/660] gtypes -> graph types; clean up file as well --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/GraphTypes.h | 16 +++++ libdeepgalois/include/deepgalois/context.h | 3 +- libdeepgalois/include/deepgalois/gtypes.h | 53 --------------- .../include/deepgalois/layers/aggregator.h | 4 +- libdeepgalois/include/deepgalois/net.h | 64 +++++++++++-------- libdeepgalois/include/deepgalois/reader.h | 2 +- libdeepgalois/include/deepgalois/sampler.h | 2 +- libdeepgalois/include/deepgalois/utils.h | 2 +- 9 files changed, 61 insertions(+), 87 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/GraphTypes.h delete mode 100644 libdeepgalois/include/deepgalois/gtypes.h diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 2f65360106..7069c1a0d7 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -5,7 +5,7 @@ */ #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { diff --git 
a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h new file mode 100644 index 0000000000..0ef3fb4a77 --- /dev/null +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -0,0 +1,16 @@ +#pragma once + +#include "deepgalois/types.h" +#include "galois/Galois.h" +#include "galois/graphs/NewGeneric.h" +#include "deepgalois/lgraph.h" + +#ifdef __GALOIS_HET_CUDA__ +// TODO reintroduce GPU as necessary here +#endif + +namespace deepgalois { +using index_t = edge_iterator; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 77c15ee890..106427ccf7 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,7 +7,7 @@ #include #include "deepgalois/types.h" #include "deepgalois/reader.h" -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" #ifdef __GALOIS_HET_CUDA__ #include "deepgalois/cutils.h" @@ -18,6 +18,7 @@ namespace deepgalois { class Context { public: Context(); + //! initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) : is_device(use_gpu), n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), use_subgraph(false), diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h deleted file mode 100644 index a2535f93a3..0000000000 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include "deepgalois/types.h" -#ifdef GALOIS_USE_DIST -#include "galois/Galois.h" -#include "galois/graphs/NewGeneric.h" -#else -#ifdef CPU_ONLY -//#include "galois/Galois.h" -//#include "galois/graphs/LCGraph.h" -#include "deepgalois/lgraph.h" -#else -#define USE_CSRGRAPH -#ifdef USE_CSRGRAPH -#include "deepgalois/lgraph.h" -#include "graph_gpu.h" -#else -#include "deepgalois/lgraph.h" -#endif -#endif -#endif - -#ifndef GALOIS_USE_DIST - -namespace deepgalois { -typedef index_t edge_iterator; -//#ifdef EDGE_LABEL -// typedef galois::graphs::LC_CSR_Graph:: -// with_numa_alloc::type ::with_no_lockable::type LCGraph; -//#else -// typedef galois::graphs::LC_CSR_Graph:: -// with_numa_alloc::type ::with_no_lockable::type LCGraph; -//#endif -// typedef LCGraph Graph; -// typedef Graph::edge_iterator edge_iterator; -typedef LearningGraph Graph; -#ifdef USE_CSRGRAPH -typedef CSRGraph GraphGPU; -#else -typedef LearningGraph GraphGPU; -#endif -} // namespace deepgalois - -#else - -namespace deepgalois { -// TODO check if this needs changing -typedef index_t edge_iterator; -using Graph = galois::graphs::DistGraph; -} // namespace deepgalois - -#endif diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 6e5e7a5926..cc6e22db00 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -3,7 +3,7 @@ //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) #ifdef CPU_ONLY -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); @@ -11,7 +11,7 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); } // namespace deepgalois #else -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" //#include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 117de131b2..4928f61f1d 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -16,7 +16,7 @@ #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" #endif @@ -45,28 +45,32 @@ class Net { << learning_rate << ", dropout_rate " << dropout_rate << ", weight_decay " << weight_decay << "\n"; num_layers = num_conv_layers + 1; + + // additional layers to add if (has_l2norm) num_layers++; if (has_dense) num_layers++; + // initialize feature metadata feature_dims.resize(num_layers + 1); -#ifndef GALOIS_USE_DIST + // initialze context context = new deepgalois::Context(); context->set_dataset(dataset_str); + // read graph, get num nodes num_samples = context->read_graph(selfloop); context->set_label_class(is_single_class); - // read graph, get num nodes + // read ground truth labels num_classes = context->read_labels(); - // std::cout << "Reading label masks ... "; + // get training and validation sets train_masks = new mask_t[num_samples]; val_masks = new mask_t[num_samples]; std::fill(train_masks, train_masks + num_samples, 0); std::fill(val_masks, val_masks + num_samples, 0); - // get training and validation sets + // reddit is hard coded if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; @@ -83,42 +87,52 @@ class Net { val_masks); } + // make sure sampel size isn't greater than what we have to train with if (subgraph_sample_size > train_count) { - std::cout << "FATAL: subgraph size can not be larger than the size of " - "training set\n"; - exit(1); + GALOIS_DIE("subgraph size can not be larger than the size of training " + "set\n"); } + // read features of vertices feature_dims[0] = context->read_features(); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) feature_dims[num_conv_layers + 1] = num_classes; // l2 normalized embedding: E + if (has_dense) feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); + + // set the subgraph boolean if sample size is greater than 0 context->set_use_subgraph(subgraph_sample_size > 0); - init(); -#endif } - Net() - : is_single_class(true), has_l2norm(false), has_dense(false), - neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), - num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), - num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), - val_count(0), 
test_begin(0), test_end(0), test_count(0), - val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - - void init(); + //! Default net constructor + //Net() + // : is_single_class(true), has_l2norm(false), has_dense(false), + // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), + // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), + // train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), + // val_count(0), test_begin(0), test_end(0), test_count(0), + // val_interval(1), num_subgraphs(1), num_vertices_sg(9000), + // train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} + + //! save graph pointer to context object + void saveDistGraph(Graph* dGraph); + #ifdef GALOIS_USE_DIST void dist_init(Graph* graph, std::string dataset_str); #endif + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -127,13 +141,9 @@ class Net { void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - std::string header = ""; - std::string seperator = " "; -#ifdef GALOIS_USE_DIST unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - header = "[" + std::to_string(myID) + "] "; - seperator = "\n"; -#endif + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 9e5faf1f39..1bcda0b4b7 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,5 +1,5 @@ #pragma once -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index c559804354..ab0fb03a25 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -1,7 +1,7 @@ #ifndef GALOIS_USE_DIST #pragma once -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { #define ETA 1.5 // length factor of DB in sampling diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 08f28126bf..7093897af2 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -8,7 +8,7 @@ #include #include #ifdef GALOIS_USE_DIST -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" #else #include "deepgalois/types.h" #endif From a2d21a0ea5f5cecc0baf415c0f67424ff2f8c7d3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 15:29:35 -0500 Subject: [PATCH 268/660] context.h -> Context.h --- libdeepgalois/include/deepgalois/{context.h => Context.h} | 0 libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/src/context.cpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename libdeepgalois/include/deepgalois/{context.h => Context.h} (100%) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/Context.h similarity index 100% rename from libdeepgalois/include/deepgalois/context.h rename to libdeepgalois/include/deepgalois/Context.h 
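The feature_dims bookkeeping in the Net constructor (see the Net.h hunk in the previous patch) is easier to follow with concrete numbers. A small self-contained sketch, assuming citeseer-like sizes (3703 input features, 6 classes, hidden1 = 16, two conv layers, an L2-norm layer and no dense layer), reproduces the layer-dimension chain; the values here are assumptions for illustration, not read from the dataset.

// layer_dims_sketch.cpp -- illustrative only; sizes are assumed, not loaded from disk
#include <cstdio>
#include <vector>

int main() {
  size_t D = 3703, hidden1 = 16, E = 6, num_conv_layers = 2;
  bool has_l2norm = true, has_dense = false;
  size_t num_layers = num_conv_layers + 1;
  if (has_l2norm) num_layers++;
  if (has_dense)  num_layers++;
  std::vector<size_t> feature_dims(num_layers + 1);
  feature_dims[0] = D;                               // input feature dimension
  for (size_t i = 1; i < num_conv_layers; i++)
    feature_dims[i] = hidden1;                       // hidden embeddings
  feature_dims[num_conv_layers] = E;                 // conv output = number of classes
  if (has_l2norm) feature_dims[num_conv_layers + 1] = E;
  if (has_dense)  feature_dims[num_layers - 1] = E;
  feature_dims[num_layers] = E;                      // output of the loss layer
  for (size_t d : feature_dims) std::printf("%zu ", d);
  std::printf("\n");                                 // prints: 3703 16 6 6 6
  return 0;
}

Layer i then maps feature_dims[i] to feature_dims[i+1], which is exactly how get_in_dim and get_out_dim above index into this vector.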
diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index ec35c1d8c9..a1c2ef630a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -11,7 +11,7 @@ #include #include "deepgalois/gtypes.h" #ifndef GALOIS_USE_DIST -#include "deepgalois/context.h" +#include "deepgalois/Context.h" #else #include "deepgalois/DistContext.h" #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 4928f61f1d..ac934d0c7d 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -14,7 +14,7 @@ #include "deepgalois/sampler.h" #endif #ifndef GALOIS_USE_DIST -#include "deepgalois/context.h" +#include "deepgalois/Context.h" #else #include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index f07da83d6d..d5cc9435ee 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,7 +1,7 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ -#include "deepgalois/context.h" +#include "deepgalois/Context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" #include "galois/Galois.h" From 4eaed77a7c641f3ea09e08ef5c6b606cc212d02e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 15:35:25 -0500 Subject: [PATCH 269/660] net -> Net; also getting context --- libdeepgalois/CMakeLists.txt | 58 ++++++------------- .../include/deepgalois/{net.h => Net.h} | 0 .../src/{context.cpp => Context.cpp} | 0 libdeepgalois/src/{net.cpp => Net.cpp} | 2 +- libdeepgalois/src/net.cu | 2 +- 5 files changed, 20 insertions(+), 42 deletions(-) rename libdeepgalois/include/deepgalois/{net.h => Net.h} (100%) rename libdeepgalois/src/{context.cpp => Context.cpp} (100%) rename libdeepgalois/src/{net.cpp => Net.cpp} (99%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 41e5130818..9a20111e0b 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -72,46 +72,24 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT ENABLE_HETERO_GALOIS) - if(ENABLE_DIST_GALOIS) - # do not link regular context.cpp; TODO do this conditional in cleaner way - # also don't link sampler - set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/DistContext.cpp - src/optimizer.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp - ) - else() - set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/optimizer.cpp - src/context.cpp - src/sampler.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp - ) - endif(ENABLE_DIST_GALOIS) + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/optimizer.cpp + src/Context.cpp + src/sampler.cpp + src/reader.cpp + 
src/lgraph.cpp + src/utils.cpp + src/node.cpp + src/Net.cpp + ) else() # dummy sources set for dg_cpu for HETERO build # TODO fix this diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/Net.h similarity index 100% rename from libdeepgalois/include/deepgalois/net.h rename to libdeepgalois/include/deepgalois/Net.h diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/Context.cpp similarity index 100% rename from libdeepgalois/src/context.cpp rename to libdeepgalois/src/Context.cpp diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/Net.cpp similarity index 99% rename from libdeepgalois/src/net.cpp rename to libdeepgalois/src/Net.cpp index ebd19639da..ede45fe2a3 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -4,7 +4,7 @@ #include "galois/Timer.h" #include "galois/Galois.h" -#include "deepgalois/net.h" +#include "deepgalois/Net.h" #include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index cd635ef07f..f1bbe97c94 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -1,4 +1,4 @@ -#include "deepgalois/net.h" +#include "deepgalois/Net.h" #include "deepgalois/cutils.h" #include "deepgalois/math_functions.hh" #include "gg.h" From 19a772fd0ce94e9466666fea27f041d6bedb41b0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 14:54:29 -0500 Subject: [PATCH 270/660] mainly sampler function commenting, renaming to get a better understanding now it works --- libdeepgalois/include/deepgalois/Context.h | 3 +- libdeepgalois/include/deepgalois/Net.h | 14 +- libdeepgalois/include/deepgalois/sampler.h | 37 +++-- libdeepgalois/src/Context.cpp | 117 ++++++++------- libdeepgalois/src/DistContext.cpp | 5 +- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/sampler.cpp | 158 +++++++++++---------- 7 files changed, 177 insertions(+), 159 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 106427ccf7..519a75d7f3 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -58,7 +58,8 @@ class Context { void norm_factor_computing(bool is_subgraph, int subg_id = 0); void gen_subgraph_labels(size_t m, const mask_t* masks); void gen_subgraph_feats(size_t m, const mask_t* masks); - void createSubgraphs(int num_subgraphs); + //! 
Allocate subgraphs (but don't actually do sampling yet) + void allocateSubgraphs(int num_subgraphs); #ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index ac934d0c7d..d478d83e4c 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -147,18 +147,16 @@ class Net { double total_train_time = 0.0; int num_subg_remain = 0; -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST + if (subgraph_sample_size) { - context->createSubgraphs(num_subgraphs); + context->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[num_samples * num_subgraphs]; - std::cout << "\nConstruct training vertex set induced graph...\n"; - sampler->set_masked_graph(train_begin, train_end, train_count, - train_masks, context->getGraphPointer()); + galois::gPrint(header, " Construct training vertex set induced graph...\n"; + sampler->initializeMaskedGraph(train_count, train_masks, context->getGraphPointer()); } -#endif -#endif + std::cout << "\nStart training...\n"; + Timer t_epoch; // run epochs for (int ep = 0; ep < num_epochs; ep++) { diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index ab0fb03a25..7934b28aa7 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -15,10 +15,12 @@ class Sampler { Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} - // sample a subgraph sg of size n from graph g + //! sample a subgraph sg of size n from graph g + //! sg is overwritten/is output void subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid = 0); - // !API function for user-defined selection strategy + //! API function for user-defined selection strategy + // TODO how to expose this? virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet& vertex_set); virtual void select_vertices(size_t n, int m, VertexSet& vertex_set, @@ -33,31 +35,38 @@ class Sampler { edge_iterator sampled_edge_end(Graph& g, VertexID v) { return g.edge_end(v); } - void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, - Graph* g); + //! Given a mask, construct the graph with only those vertices ans ave as the + //! masked graph in this class for the sampler. + void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g); protected: int m_; size_t count_; - size_t begin_; - size_t end_; + + //! averaged degree of masked graph int avg_deg; + //! average degree cut off to a clip int subg_deg; - VertexList vertices_; + //! list of vertices active in the graph being maintained (masked_graph) + //VertexList vertices_; + //! List of training nodes; sampling set std::vector node_train; mask_t* masks_; + //! masked original graph; typically to the training set Graph* masked_graph; Graph* graph; - // Given a subset of vertices and a graph g, generate a subgraph sg from the - // graph g - void generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub); - void generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& mg); + //! Reindex a graph to only contain those in the vertex set + void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); + //! Given a graph, return a graph with edges to unmasked vertices removed in + //! 
mg + void getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& mg); void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector& degrees); - void update_masks(size_t n, VertexSet vertices, mask_t* masks); - inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); - void check_DB(std::vector& DB0, std::vector& DB1, + //! Set masks bitset with IDs in the vertices VertexSet + void getMasks(size_t n, VertexSet vertices, mask_t* masks); + inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size); }; diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index d5cc9435ee..58526d7a96 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -22,46 +22,52 @@ Context::~Context() { // if (norm_factors_subg) delete[] norm_factors_subg; } -void Context::createSubgraphs(int num_subgraphs) { +void Context::allocateSubgraphs(int num_subgraphs) { subgraphs_cpu.resize(num_subgraphs); for (int i = 0; i < num_subgraphs; i++) subgraphs_cpu[i] = new Graph(); } -// generate labels for the subgraph, m is subgraph size +//! generate labels for the subgraph, m is subgraph size, mask +//! tells which vertices to use void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - if (is_single_class) { - h_labels_subg.resize(m); + if (Context::is_single_class) { + Context::h_labels_subg.resize(m); } else { - h_labels_subg.resize(m * num_classes); + Context::h_labels_subg.resize(m * Context::num_classes); } + size_t count = 0; + // see which labels to copy over for this subgraph for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - if (is_single_class) { - h_labels_subg[count] = h_labels[i]; + if (Context::is_single_class) { + Context::h_labels_subg[count] = h_labels[i]; } else { - std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, - &h_labels_subg[count * num_classes]); + std::copy(Context::h_labels + i * Context::num_classes, Context::h_labels + (i + 1) * Context::num_classes, + &Context::h_labels_subg[count * Context::num_classes]); } count++; } } + assert(count == m); } -// generate input features for the subgraph, m is subgraph size +//! generate input features for the subgraph, m is subgraph size, +//! 
masks tells which vertices to use void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { size_t count = 0; // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - h_feats_subg.resize(m * feat_len); + Context::h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, - &h_feats_subg[count * feat_len]); + std::copy(Context::h_feats + i * Context::feat_len, Context::h_feats + (i + 1) * Context::feat_len, + &Context::h_feats_subg[count * Context::feat_len]); count++; } } + assert(count == m); } size_t Context::read_graph(bool selfloop) { @@ -80,63 +86,64 @@ size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { + galois::gWarn("SELF LOOPS NOT SUPPORTED AT THIS TIME"); Graph graph_temp; // galois::graphs::readGraph(graph_temp, filename); graph_temp.readGraph(dataset); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; //} else galois::graphs::readGraph(*graph_cpu, filename); - } else + } else { graph_cpu->readGraph(dataset); + } // TODO dist version of self loop } else { - printf("Unkown file format\n"); - exit(1); + GALOIS_DIE("unknown file format for readgraph"); } Tread.stop(); + auto g = getGraphPointer(); - std::cout << "num_vertices " << g->size() << " num_edges " << g->sizeEdges() - << "\n"; - n = g->size(); - return n; + galois::gPrint("num_vertices ", g->size(), " num_edges ", g->sizeEdges(), + "\n"); + return g->size(); } void Context::add_selfloop(Graph& og, Graph& g) { + // TODO not actually implemented yet g.allocateFrom(og.size(), og.size() + og.sizeEdges()); g.constructNodes(); - /* - for (size_t src = 0; src < og.size(); src++) { - //g.getData(src) = 1; - auto begin = og.edge_begin(src); - auto end = og.edge_end(src); - g.fixEndEdge(src, end+src+1); - bool self_inserted = false; - if (begin == end) { - new_edge_dst[begin+i] = i; - continue; - } - for (auto e = begin; e != end; e++) { - auto dst = og.getEdgeDst(e); - if (!self_inserted) { - if (dst > src) { - g.constructEdge(e+src, src, 0); - g.constructEdge(e+src+1, dst, 0); - self_inserted = true; - } else if (e+1 == end) { - g.constructEdge(e+src+1, src, 0); - g.constructEdge(e+src, dst, 0); - self_inserted = true; - } else g.constructEdge(e+src, dst, 0); - } else g.constructEdge(e+src+1, dst, 0); - } - } - //*/ + //for (size_t src = 0; src < og.size(); src++) { + // //g.getData(src) = 1; + // auto begin = og.edge_begin(src); + // auto end = og.edge_end(src); + // g.fixEndEdge(src, end+src+1); + // bool self_inserted = false; + // if (begin == end) { + // new_edge_dst[begin+i] = i; + // continue; + // } + // for (auto e = begin; e != end; e++) { + // auto dst = og.getEdgeDst(e); + // if (!self_inserted) { + // if (dst > src) { + // g.constructEdge(e+src, src, 0); + // g.constructEdge(e+src+1, dst, 0); + // self_inserted = true; + // } else if (e+1 == end) { + // g.constructEdge(e+src+1, src, 0); + // g.constructEdge(e+src, dst, 0); + // self_inserted = true; + // } else g.constructEdge(e+src, dst, 0); + // } else g.constructEdge(e+src+1, dst, 0); + // } + //} } void Context::alloc_norm_factor() { Graph* g = getGraphPointer(); if (norm_factors == NULL) #ifdef USE_MKL + // TODO why does MKL use size edges norm_factors = new float_t[g->sizeEdges()]; #else norm_factors = new float_t[g->size()]; @@ -145,19 +152,19 @@ void Context::alloc_norm_factor() { void 
Context::alloc_subgraph_norm_factor(int subg_id) { Graph* g = getSubgraphPointer(subg_id); - // if (norm_factors_subg == NULL) #ifdef USE_MKL - // norm_factors_subg = new float_t[g->sizeEdges()]; norm_factors_subg.resize(g->sizeEdges()); #else norm_factors_subg.resize(g->size()); - // norm_factors_subg = new float_t[g->size()]; #endif + norm_factors_subg.clear(); } void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; float_t* constants; + + // grab orig or subgraph pointer as necessary if (!is_subgraph) { g = getGraphPointer(); alloc_norm_factor(); @@ -167,6 +174,7 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { alloc_subgraph_norm_factor(subg_id); constants = get_norm_factors_subg_ptr(); } + auto g_size = g->size(); g->degree_counting(); #ifdef USE_MKL @@ -265,13 +273,4 @@ void Context::read_edgelist(const char* filename, bool symmetrize, } } -/* -inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); -} -*/ - } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 1da6c6c5a1..3332aeabaf 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,13 +6,12 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -void DistContext::saveGraph(Graph* dGraph) { +void DistContext::saveGraph(DGraph* dGraph) { graph_cpu = dGraph; - localVertices = graph_cpu->size(); } -size_t DistContext::read_labels(std::string dataset_str) { +size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 365bef8e50..05a1b0cd8f 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -99,7 +99,7 @@ Context::~Context() { CUDA_CHECK(cudaFree(norm_factors)); } -void Context::createSubgraphs(int n_sg) {} +void Context::allocateSubgraphs(int n_sg) {} void Context::gen_subgraph_labels(size_t m, const mask_t* masks) {} diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 0ac77526f3..727a95eb55 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -12,57 +12,57 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, - mask_t* masks, Graph* g) { - // galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", - // count=", count, "\n"); - begin_ = begin; - end_ = end; - count_ = count; - masks_ = masks; - graph = g; -#ifndef GALOIS_USE_DIST - masked_graph = new Graph(); -#endif - // generate_masked_graph(g->size(), masks, g, *masked_graph); +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { + this->count_ = count; + this->masks_ = masks; + // save original graph + Sampler::graph = g; + // allocate the object for the new masked graph + Sampler::masked_graph = new Graph(); + std::vector degrees(g->size(), 0); - get_masked_degrees(g->size(), masks, g, degrees); + // get degrees of nodes that will be in new graph + this->get_masked_degrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = 
offsets[g->size()]; + + // save ids (on original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { if (masks[i] == 1) - node_train.push_back(i); + Sampler::node_train.push_back(i); } - masked_graph->allocateFrom(g->size(), ne); - masked_graph->constructNodes(); + + Sampler::masked_graph->allocateFrom(g->size(), ne); + Sampler::masked_graph->constructNodes(); + // same as original graph, except keep only edges involved in masks galois::do_all( galois::iterate((size_t)0, g->size()), [&](const auto src) { - masked_graph->fixEndEdge(src, offsets[src + 1]); + Sampler::masked_graph->fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) - masked_graph->constructEdge(idx++, dst, 0); + Sampler::masked_graph->constructEdge(idx++, dst, 0); } } }, galois::loopname("gen_subgraph")); - masked_graph->degree_counting(); - avg_deg = masked_graph->sizeEdges() / masked_graph->size(); - subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - // galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " - // num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); - size_t idx = 0; - vertices_.resize(count); - for (size_t i = begin; i < end; i++) { - if (masks_[i] == 1) - vertices_[idx++] = i; - } + Sampler::masked_graph->degree_counting(); + Sampler::avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + + //size_t idx = 0; + //vertices_.resize(count); + //for (size_t i = begin; i < end; i++) { + // if (masks_[i] == 1) + // vertices_[idx++] = i; + //} } +//! determine degree of each vertex in a masked graph (given by masks and g) void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector& degrees) { assert(degrees.size() == n); @@ -87,18 +87,22 @@ void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, #endif } -void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, +//! returns a graph in the variable sub: it is g with the mask applied +void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); - get_masked_degrees(n, masks, g, degrees); + this->get_masked_degrees(n, masks, g, degrees); // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", // ne, "\n"); -#ifndef GALOIS_USE_DIST + // + + // note this constructs the full graph's nodes; just trims edges sub.allocateFrom(n, ne); sub.constructNodes(); + #ifdef PARALLEL_GEN galois::do_all( galois::iterate((size_t)0, n), @@ -123,7 +127,9 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, #endif } -void Sampler::check_DB(std::vector& DB0, std::vector& DB1, + +// helper function for graph saint implementation below +void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); @@ -135,6 +141,7 @@ void Sampler::check_DB(std::vector& DB0, std::vector& DB1, DB2.resize(size); } +//! 
debug function: prints out sets of vertices void print_vertex_set(VertexSet vertex_set) { unsigned counter = 0; unsigned n = vertex_set.size(); @@ -148,9 +155,11 @@ void print_vertex_set(VertexSet vertex_set) { galois::gPrint(")\n"); } -void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { - // unsigned myseed = time(NULL); - unsigned myseed = tid + time(NULL); +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { + unsigned myseed = seed; + // unsigned myseed = tid; // DBx: Dashboard line x, IAx: Index array line x std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; @@ -172,11 +181,12 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { // printf("( "); // for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; // printf(")\n"); + for (int i = 0; i < m; i++) { - auto rand_idx = rand_r(&myseed) % node_train.size(); - db_t v = IA3[i] = node_train[rand_idx]; - st.insert(v); - IA0[i] = getDegree(masked_graph, v); + auto rand_idx = rand_r(&myseed) % Sampler::node_train.size(); + db_t v = IA3[i] = Sampler::node_train[rand_idx]; + st.iisert(v); + IA0[i] = getDegree(Sampler::masked_graph, v); IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; IA1[i] = 1; IA2[i] = 0; @@ -187,7 +197,7 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { for (int i = 1; i < m; i++) IA2[i] = IA2[i - 1] + IA0[i]; // now fill DB accordingly - check_DB(DB0, DB1, DB2, IA2[m - 1]); + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); for (int i = 0; i < m; i++) { db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; db_t DB_end = IA2[i]; @@ -209,16 +219,16 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { } choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); db_t v = DB0[choose]; - auto degree = getDegree(masked_graph, v); + auto degree = getDegree(Sampler::masked_graph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v) + neigh_v); + neigh_v = Sampler::masked_graph->getEdgeDst(Sampler::masked_graph->edge_begin(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; for (auto i = choose; i < choose - DB1[choose]; i++) DB0[i] = db_t(-1); - newsize = getDegree(masked_graph, neigh_v); + newsize = getDegree(Sampler::masked_graph, neigh_v); newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; } else newsize = 0; @@ -270,7 +280,7 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { IA2.resize(curr); IA3.resize(curr); } - check_DB(DB0, DB1, DB2, newsize + DB0.size()); + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); IA0.push_back(newsize); IA1.push_back(1); IA2.push_back(IA2.back() + IA0.back()); @@ -287,11 +297,12 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { // print_vertex_set(st); } -// !API function for user-defined selection strategy +// API function for user-defined selection strategy // Select n vertices from vertices and put them in vertex_set. // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
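Stripped of the dashboard bookkeeping (the DB*/IA* arrays, which exist to make the degree-weighted slot choice and its updates cheap), the routine above is the GraphSAINT frontier sampler: seed a frontier of m training vertices, repeatedly replace one frontier slot with a random neighbour of the vertex it holds, and record every visited vertex until n distinct vertices are collected. A simplified, unweighted sketch of that idea (illustrative names, not the Sampler API; rand_r is POSIX):

#include <cstdlib>
#include <set>
#include <vector>

// Simplified frontier sampling in the spirit of GraphSAINT. The code above
// additionally weights the slot choice by clipped degree; here every slot
// is equally likely. adj[v] is the neighbour list of v. Illustrative only.
std::set<uint32_t> frontierSample(const std::vector<std::vector<uint32_t>>& adj,
                                  const std::vector<uint32_t>& trainNodes,
                                  size_t n, size_t m, unsigned seed) {
  std::set<uint32_t> sample;
  std::vector<uint32_t> frontier(m);
  for (size_t i = 0; i < m; ++i) { // seed the frontier from the training set
    frontier[i] = trainNodes[rand_r(&seed) % trainNodes.size()];
    sample.insert(frontier[i]);
  }
  size_t attempts = 0, maxAttempts = 100 * n; // bail out on tiny components
  while (sample.size() < n && attempts++ < maxAttempts) {
    size_t slot = rand_r(&seed) % m;
    uint32_t v  = frontier[slot];
    if (adj[v].empty()) continue;               // dead end; pick another slot
    uint32_t u = adj[v][rand_r(&seed) % adj[v].size()];
    frontier[slot] = u;                         // walk to a random neighbour
    sample.insert(u);
  }
  return sample;
}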
+// our implementation of GraphSAINT sampling void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet& vertex_set) { // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, @@ -334,14 +345,14 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, */ } -void Sampler::update_masks(size_t n, VertexSet vertices, mask_t* masks) { +void Sampler::getMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); for (auto v : vertices) masks[v] = 1; } -inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { +inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { VertexList new_ids(n, 0); int vid = 0; for (auto v : vertex_set) { @@ -352,13 +363,13 @@ inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { - // auto n = g.size(); // old graph size - auto nv = vertex_set.size(); // new graph (subgraph) size - VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { + // auto n = origGraph.size(); // old graph size + auto nv = keptVertices.size(); // new graph (subgraph) size + VertexList new_ids = this->reindexVertices(graph->size(), keptVertices); std::vector degrees(nv, 0); // degrees of vertices in the subgraph - for (auto v : vertex_set) { - degrees[new_ids[v]] = getDegree(&g, v); + for (auto v : keptVertices) { + degrees[new_ids[v]] = getDegree(&origGraph, v); } // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); @@ -366,9 +377,9 @@ void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, // "\n"); #ifndef GALOIS_USE_DIST - sub.allocateFrom(nv, ne); - sub.constructNodes(); - VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping + reindexGraph.allocateFrom(nv, ne); + reindexGraph.constructNodes(); + VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN galois::do_all( galois::iterate((size_t)0, nv), @@ -376,13 +387,13 @@ void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { #else for (size_t i = 0; i < nv; i++) { #endif - sub.fixEndEdge(i, offsets[i + 1]); + reindexGraph.fixEndEdge(i, offsets[i + 1]); unsigned j = 0; auto old_id = old_ids[i]; - for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { - auto dst = new_ids[g.getEdgeDst(e)]; + for (auto e = origGraph.edge_begin(old_id); e != origGraph.edge_end(old_id); e++) { + auto dst = new_ids[origGraph.getEdgeDst(e)]; assert(dst < nv); - sub.constructEdge(offsets[i] + j, dst, 0); + reindexGraph.constructEdge(offsets[i] + j, dst, 0); j++; } } @@ -395,19 +406,20 @@ void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid) { - VertexSet vertex_set; // n = 9000 by default - // select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = - // 1000 by default - select_vertices(n, m_, vertex_set, tid); // m = 1000 by default - update_masks(graph->size(), vertex_set, - 
masks); // set masks for vertices in the vertex_set -#ifndef GALOIS_USE_DIST + VertexSet sampledSet; + // n = 9000 by default + // this->select_vertices(count_, n, m_, masked_graph, vertices_, sampledSet); + + // do the sampling of vertices from training set + using masked graph + this->select_vertices(n, m_, sampledSet, tid); // m = 1000 by default + + // create the masks on the masked_graph + getMasks(Sampler::graph->size(), sampledSet, masks); + Graph masked_sg; - generate_masked_graph( - graph->size(), masks, masked_graph, + this->getMaskedGraph(Sampler::graph->size(), masks, Sampler::masked_graph, masked_sg); // remove edges whose destination is not masked - generate_subgraph(vertex_set, masked_sg, sg); -#endif + this->reindexSubgraph(sampledSet, masked_sg, sg); } } // namespace deepgalois From 541745380f9cd1ddcb081ff320a72cbdc513b90f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 16:01:23 -0500 Subject: [PATCH 271/660] sampler renaming --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/Net.h | 7 ++----- libdeepgalois/include/deepgalois/{sampler.h => Sampler.h} | 0 libdeepgalois/src/{sampler.cpp => Sampler.cpp} | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) rename libdeepgalois/include/deepgalois/{sampler.h => Sampler.h} (100%) rename libdeepgalois/src/{sampler.cpp => Sampler.cpp} (99%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 9a20111e0b..58309084b1 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -83,7 +83,7 @@ if(NOT ENABLE_HETERO_GALOIS) src/math_functions.cpp src/optimizer.cpp src/Context.cpp - src/sampler.cpp + src/Sampler.cpp src/reader.cpp src/lgraph.cpp src/utils.cpp diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index d478d83e4c..61fb1034c7 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -10,13 +10,10 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" #include "deepgalois/utils.h" -#ifdef CPU_ONLY -#include "deepgalois/sampler.h" -#endif -#ifndef GALOIS_USE_DIST +#include "deepgalois/Sampler.h" #include "deepgalois/Context.h" -#else #include "deepgalois/GraphTypes.h" + #include "deepgalois/DistContext.h" #endif diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/Sampler.h similarity index 100% rename from libdeepgalois/include/deepgalois/sampler.h rename to libdeepgalois/include/deepgalois/Sampler.h diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/Sampler.cpp similarity index 99% rename from libdeepgalois/src/sampler.cpp rename to libdeepgalois/src/Sampler.cpp index 727a95eb55..aa6fb6d686 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -1,5 +1,5 @@ #include "deepgalois/utils.h" -#include "deepgalois/sampler.h" +#include "deepgalois/Sampler.h" #include "galois/Galois.h" #include #include From a4f1c063f994c394db7258d27e2596af84a1d119 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 17:55:27 -0500 Subject: [PATCH 272/660] renaming net things for easier understanding; getting ready for dist sampling --- .../include/deepgalois/DistContext.h | 16 -- libdeepgalois/include/deepgalois/Net.h | 216 ++++++++++-------- libdeepgalois/src/DistContext.cpp | 20 -- libdeepgalois/src/Net.cpp | 63 ++--- lonestar/gnn/gcn/gcn.cpp | 12 +- 5 files changed, 160 insertions(+), 167 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h 
b/libdeepgalois/include/deepgalois/DistContext.h index 7069c1a0d7..e1b76fa00c 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,7 +10,6 @@ namespace deepgalois { class DistContext { -protected: size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D @@ -23,12 +22,6 @@ class DistContext { label_t* h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t* d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - float_t* norm_factors_subg; // normalization constant for subgraph public: DistContext(); @@ -36,23 +29,14 @@ class DistContext { //! save graph pointer to context object void saveDistGraph(Graph* dGraph); - //! read labels of local nodes only size_t read_labels(std::string dataset_str); - //! read features of local nodes only size_t read_features(std::string dataset_str); - //! read masks of local nodes only size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); - //! find norm factor by looking at degree - // TODO this is a distributed operation - void norm_factor_computing(bool is_subgraph, int subg_id = 0); - // void createSubgraphs(int num_subgraphs) {} - // void gen_subgraph_labels(size_t m, const mask_t *masks) {} - // void gen_subgraph_feats(size_t m, const mask_t *masks) {} // TODO define these void createSubgraphs(int) {} void gen_subgraph_labels(size_t, const mask_t*) {} diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 61fb1034c7..59674abc41 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -24,6 +24,67 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; + + bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads + size_t num_samples; // number of samples: N + size_t distNumSamples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) + int num_epochs; // number of epochs + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting + // begins/ends below are global ids + size_t globalTrainBegin; + size_t globalTrainEnd; + size_t globalTrainCount; + size_t globalValBegin; + size_t globalValEnd; + size_t globalValCount; + size_t globalTestBegin; + size_t globalTestEnd; + size_t globalTestCount; + int val_interval; + int num_subgraphs; + int num_vertices_sg; + bool is_selfloop; + + mask_t* 
globalTrainMasks; // masks for training + mask_t* globalValMasks; // masks for validation + mask_t* distTrainMasks; + mask_t* distValMasks; + mask_t* test_masks; // masks for test + + + mask_t* d_train_masks; // masks for training on device + mask_t* d_val_masks; // masks for validation on device + mask_t* d_test_masks; // masks for test on device + + mask_t* subgraphs_masks; // masks for subgraphs + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network + + // one context is for entire graph; other is for partitioned graph + // TODO optimize single host case + + //! context holds all of the graph data + deepgalois::Context* context; + //! dist context holds graph data of the partitioned graph only + deepgalois::DistContext* distContext; + + DGraph* dGraph; + + Sampler* sampler; + public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, @@ -34,13 +95,19 @@ class Net { num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + // init some identifiers for this host + this->myID = galois::runtime::getSystemNetworkInterface().ID; + this->header = "[" + std::to_string(myID) + "] "; + this->seperator = "\n"; + assert(n_conv > 0); + // TODO use galois print - std::cout << "Configuration: num_threads " << num_threads - << ", num_conv_layers " << num_conv_layers << ", num_epochs " - << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " - << learning_rate << ", dropout_rate " << dropout_rate - << ", weight_decay " << weight_decay << "\n"; + galois>>gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); num_layers = num_conv_layers + 1; // additional layers to add @@ -62,30 +129,34 @@ class Net { num_classes = context->read_labels(); // get training and validation sets - train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks + num_samples, 0); - std::fill(val_masks, val_masks + num_samples, 0); + globalTrainMasks = new mask_t[num_samples]; + globalValMasks = new mask_t[num_samples]; + std::fill(globalTrainMasks, globalTrainMasks + num_samples, 0); + std::fill(globalValMasks, globalValMasks + num_samples, 0); // reddit is hard coded if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + this->globalTrainBegin = 0; + this->globalTrainCount = 153431; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; + // TODO do all can be used below - for (size_t i = train_begin; i < train_end; i++) - train_masks[i] = 1; - for (size_t i = val_begin; i < val_end; i++) - val_masks[i] = 1; + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) + globalTrainMasks[i] = 1; + for (size_t i = globalValBegin; i < globalValEnd; i++) + globalValMasks[i] = 1; } else { - train_count = context->read_masks("train", num_samples, train_begin, - train_end, train_masks); - val_count = 
context->read_masks("val", num_samples, val_begin, val_end, - val_masks); + globalTrainCount = context->read_masks("train", num_samples, globalTrainBegin, + globalTrainEnd, globalTrainMasks); + globalValCount = context->read_masks("val", num_samples, globalValBegin, globalValEnd, + globalValMasks); } // make sure sampel size isn't greater than what we have to train with - if (subgraph_sample_size > train_count) { + if (subgraph_sample_size > globalTrainCount) { GALOIS_DIE("subgraph size can not be larger than the size of training " "set\n"); } @@ -118,17 +189,13 @@ class Net { // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - // train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), - // val_count(0), test_begin(0), test_end(0), test_count(0), + // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), globalValBegin(0), globalValEnd(0), + // globalValCount(0), globalTestBegin(0), globalTestEnd(0), globalTestCount(0), // val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - // train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - - //! save graph pointer to context object - void saveDistGraph(Graph* dGraph); + // globalTrainMasks(NULL), globalValMasks(NULL), test_masks(NULL), context(NULL) {} -#ifdef GALOIS_USE_DIST - void dist_init(Graph* graph, std::string dataset_str); -#endif + //! Initializes metadata for the partition + void partitionInit(DGraph* graph, std::string dataset_str); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -138,9 +205,6 @@ class Net { void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - std::string header = "[" + std::to_string(myID) + "] "; - std::string seperator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -149,7 +213,7 @@ class Net { context->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[num_samples * num_subgraphs]; galois::gPrint(header, " Construct training vertex set induced graph...\n"; - sampler->initializeMaskedGraph(train_count, train_masks, context->getGraphPointer()); + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); } std::cout << "\nStart training...\n"; @@ -269,10 +333,10 @@ class Net { size_t begin = 0, end = 0, count = 0; mask_t* masks = NULL; if (type == "train") { - begin = train_begin; - end = train_end; - count = train_count; - masks = train_masks; + begin = globalTrainBegin; + end = globalTrainEnd; + count = globalTrainCount; + masks = globalTrainMasks; if (subgraph_sample_size) { // update masks for subgraph masks = NULL; @@ -281,14 +345,14 @@ class Net { count = num_vertices_sg; } } else if (type == "val") { - begin = val_begin; - end = val_end; - count = val_count; - masks = val_masks; + begin = globalValBegin; + end = globalValEnd; + count = globalValCount; + masks = globalValMasks; } else { - begin = test_begin; - end = test_end; - count = test_count; + begin = globalTestBegin; + end = globalTestEnd; + count = globalTestCount; masks = test_masks; } #ifdef CPU_ONLY @@ -334,14 +398,14 @@ class Net { void read_test_masks(std::string dataset) { test_masks = new mask_t[num_samples]; if (dataset == "reddit") { - test_begin = 177262; - test_count 
= 55703; - test_end = test_begin + test_count; + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; #ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) + for (size_t i = globalTestBegin; i < globalTestEnd; i++) test_masks[i] = 1; #else - for (size_t i = test_begin; i < test_end; i++) { + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { if (dGraph->isLocal(i)) { test_masks[dGraph->getLID(i)] = 1; } @@ -349,11 +413,11 @@ class Net { #endif } else { #ifndef GALOIS_USE_DIST - test_count = context->read_masks("test", num_samples, test_begin, - test_end, test_masks); + globalTestCount = context->read_masks("test", num_samples, globalTestBegin, + globalTestEnd, test_masks); #else - test_count = context->read_masks("test", num_samples, test_begin, - test_end, test_masks, dGraph); + globalTestCount = context->read_masks("test", num_samples, globalTestBegin, + globalTestEnd, test_masks, dGraph); #endif } #ifndef CPU_ONLY @@ -492,50 +556,6 @@ class Net { layers[i]->print_layer_info(); } -protected: - bool is_single_class; // single-class (one-hot) or multi-class label - bool has_l2norm; // whether the net contains an l2_norm layer - bool has_dense; // whether the net contains an dense layer - unsigned neighbor_sample_size; // neighbor sampling - unsigned subgraph_sample_size; // subgraph sampling - int num_threads; // number of threads - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_conv_layers; // number of convolutional layers - size_t num_layers; // total number of layers (conv + output) - int num_epochs; // number of epochs - float learning_rate; // learning rate - float dropout_rate; // dropout rate - float weight_decay; // weighti decay for over-fitting - size_t train_begin, train_end, train_count; - size_t val_begin, val_end, val_count; - size_t test_begin, test_end, test_count; - int val_interval; - int num_subgraphs; - int num_vertices_sg; - bool is_selfloop; - - mask_t* train_masks; // masks for training - mask_t* d_train_masks; // masks for training on device - mask_t* val_masks; // masks for validation - mask_t* d_val_masks; // masks for validation on device - mask_t* test_masks; // masks for test - mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs - std::vector feature_dims; // feature dimnesions for each layer - std::vector layers; // all the layers in the neural network -#ifndef GALOIS_USE_DIST - deepgalois::Context* context; -#else - deepgalois::DistContext* context; - Graph* dGraph; -#endif - -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST - Sampler* sampler; -#endif -#endif // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 3332aeabaf..f7ad18bc22 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -150,26 +150,6 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -// void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { -void DistContext::norm_factor_computing(bool, int) { - // TODO: this is a distributed operation - - // create for now, TODO need to actually fill it in - norm_factors = new float_t[localVertices]; - galois::do_all( - 
galois::iterate((size_t)0, localVertices), - [&](auto v) { norm_factors[v] = 1; }, galois::loopname("NormCounting")); - - // galois::do_all(galois::iterate((size_t)0, localVertices), - // [&](auto v) { - // auto degree = std::distance(graph_cpu->edge_begin(v), - // graph_cpu->edge_end(v)); float_t temp = std::sqrt(float_t(degree)); if - // (temp == 0.0) norm_factors[v] = 0.0; else norm_factors[v] = 1.0 / temp; - // }, galois::loopname("NormCounting")); - - return; -} - void DistContext::initializeSyncSubstrate() { DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index ede45fe2a3..a986ec194d 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -10,44 +10,51 @@ namespace deepgalois { #ifdef GALOIS_USE_DIST -void Net::dist_init(Graph* graph, std::string dataset_str) { - dGraph = graph; - context = new deepgalois::DistContext(); - num_samples = dGraph->size(); - context->saveGraph(dGraph); - // TODO self loop setup? - context->initializeSyncSubstrate(); - num_classes = context->read_labels(); +void Net::partitionInit(DGraph* graph, std::string dataset_str) { + this->dGraph = graph; + this->distContext = new deepgalois::DistContext(); + this->distContext->saveDistGraph(dGraph); + this->distNumSamples = this->dGraph->size(); + + // TODO self loop setup would have to be done before this during partitioning + // or on master node only + + this->distContext->initializeSyncSubstrate(); + num_classes = this->distContext->read_labels(); // std::cout << "Reading label masks ... "; - train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks + num_samples, 0); - std::fill(val_masks, val_masks + num_samples, 0); + this->distTrainMasks = new mask_t[this->distNumSamples]; + this->distValMasks = new mask_t[this->distNumSamples]; + std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, 0); + std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + //this->globalTrainBegin = 0; + //this->globalTrainCount = 153431; + //this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + //this->globalValBegin = 153431; + //this->globalValCount = 23831; + //this->globalValEnd = this->globalValBegin + this->globalValCount; + // find local ID from global ID, set if it exists - for (size_t i = train_begin; i < train_end; i++) { - if (dGraph->isLocal(i)) { - train_masks[dGraph->getLID(i)] = 1; + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distTrainMasks[this->dGraph->getLID(i)] = 1; } } - for (size_t i = val_begin; i < val_end; i++) { - if (dGraph->isLocal(i)) { - val_masks[dGraph->getLID(i)] = 1; + for (size_t i = globalValBegin; i < globalValEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distValMasks[this->dGraph->getLID(i)] = 1; } } } else { - train_count = context->read_masks("train", num_samples, train_begin, - train_end, train_masks, dGraph); - val_count = context->read_masks("val", num_samples, val_begin, val_end, - val_masks, dGraph); + globalTrainCount = this->distContext->read_masks("train", this->distNumSamples, globalTrainBegin, + globalTrainEnd, this->distTrainMasks, 
this->dGraph); + globalValCount = this->distContext->read_masks("val", this->distNumSamples, globalValBegin, globalValEnd, + this->distValMasks, this->dGraph); } - feature_dims[0] = context->read_features(); // input feature dimension: D + feature_dims[0] = this->distContext->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E @@ -113,10 +120,10 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, #else // only look at owned nodes (i.e. masters); the prediction for these // should only by handled on the owner - if (dGraph->isOwned(i)) { + if (this->dGraph->isOwned(i)) { sampleCount += 1; - uint32_t localID = dGraph->getLID(i); + uint32_t localID = this->dGraph->getLID(i); if (masks[localID] == 1) { // get prediction auto pred = diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index f2d08d3cb3..702fc63516 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -13,16 +13,18 @@ int main(int argc, char** argv) { galois::DistMemSys G; LonestarGnnStart(argc, argv, name, desc, url); - // the neural network to train: loads the entire graph on CPU + // Get a partitioned graph first + std::vector dummyVec; + deepgalois::Graph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); + network.dist_init(dGraph, dataset); + + // initialize entire on CPU deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); - std::vector dummyVec; - deepgalois::Graph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph, dataset); // read network, features, ground truth, initialize metadata // default setting for now; can be customized by the user From 2f90bf93ce4f9b27d01c46adfa13479de740d5b1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 17:58:02 -0500 Subject: [PATCH 273/660] re-clang formatting --- libdeepgalois/include/deepgalois/GraphTypes.h | 4 +- libdeepgalois/include/deepgalois/Net.h | 79 ++++++++++--------- libdeepgalois/include/deepgalois/Sampler.h | 4 +- libdeepgalois/src/Context.cpp | 10 ++- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/Net.cpp | 32 ++++---- libdeepgalois/src/Sampler.cpp | 29 ++++--- libdeepgalois/src/math_functions.cu | 4 +- 8 files changed, 88 insertions(+), 76 deletions(-) diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 0ef3fb4a77..ba241c53f5 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -11,6 +11,6 @@ namespace deepgalois { using index_t = edge_iterator; -using DGraph = galois::graphs::DistGraph; -using Graph = LearningGraph; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 59674abc41..195c524a2d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -24,9 +24,9 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - std::string header = "[" 
+ std::to_string(myID) + "] "; - std::string seperator = "\n"; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; bool is_single_class; // single-class (one-hot) or multi-class label bool has_l2norm; // whether the net contains an l2_norm layer @@ -35,7 +35,7 @@ class Net { unsigned subgraph_sample_size; // subgraph sampling int num_threads; // number of threads size_t num_samples; // number of samples: N - size_t distNumSamples; // number of samples: N + size_t distNumSamples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) @@ -58,16 +58,15 @@ class Net { int num_vertices_sg; bool is_selfloop; - mask_t* globalTrainMasks; // masks for training - mask_t* globalValMasks; // masks for validation + mask_t* globalTrainMasks; // masks for training + mask_t* globalValMasks; // masks for validation mask_t* distTrainMasks; mask_t* distValMasks; - mask_t* test_masks; // masks for test + mask_t* test_masks; // masks for test - - mask_t* d_train_masks; // masks for training on device - mask_t* d_val_masks; // masks for validation on device - mask_t* d_test_masks; // masks for test on device + mask_t* d_train_masks; // masks for training on device + mask_t* d_val_masks; // masks for validation on device + mask_t* d_test_masks; // masks for test on device mask_t* subgraphs_masks; // masks for subgraphs std::vector feature_dims; // feature dimnesions for each layer @@ -96,18 +95,18 @@ class Net { learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host - this->myID = galois::runtime::getSystemNetworkInterface().ID; - this->header = "[" + std::to_string(myID) + "] "; - this->seperator = "\n"; + this->myID = galois::runtime::getSystemNetworkInterface().ID; + this->header = "[" + std::to_string(myID) + "] "; + this->seperator = "\n"; assert(n_conv > 0); // TODO use galois print - galois>>gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + galois >> gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); num_layers = num_conv_layers + 1; // additional layers to add @@ -138,10 +137,10 @@ class Net { if (dataset_str == "reddit") { this->globalTrainBegin = 0; this->globalTrainCount = 153431; - this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - this->globalValBegin = 153431; - this->globalValCount = 23831; - this->globalValEnd = this->globalValBegin + this->globalValCount; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; // TODO do all can be used below for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) @@ -149,10 +148,11 @@ class Net { for (size_t i = globalValBegin; i < globalValEnd; i++) globalValMasks[i] = 1; } else { - globalTrainCount = 
context->read_masks("train", num_samples, globalTrainBegin, - globalTrainEnd, globalTrainMasks); - globalValCount = context->read_masks("val", num_samples, globalValBegin, globalValEnd, - globalValMasks); + globalTrainCount = + context->read_masks("train", num_samples, globalTrainBegin, + globalTrainEnd, globalTrainMasks); + globalValCount = context->read_masks("val", num_samples, globalValBegin, + globalValEnd, globalValMasks); } // make sure sampel size isn't greater than what we have to train with @@ -165,7 +165,7 @@ class Net { feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E @@ -184,15 +184,17 @@ class Net { } //! Default net constructor - //Net() + // Net() // : is_single_class(true), has_l2norm(false), has_dense(false), // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), - // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), globalValBegin(0), globalValEnd(0), - // globalValCount(0), globalTestBegin(0), globalTestEnd(0), globalTestCount(0), - // val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - // globalTrainMasks(NULL), globalValMasks(NULL), test_masks(NULL), context(NULL) {} + // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), + // weight_decay(0.0), globalTrainBegin(0), globalTrainEnd(0), + // globalTrainCount(0), globalValBegin(0), globalValEnd(0), + // globalValCount(0), globalTestBegin(0), globalTestEnd(0), + // globalTestCount(0), val_interval(1), num_subgraphs(1), + // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), + // test_masks(NULL), context(NULL) {} //! Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str); @@ -413,11 +415,12 @@ class Net { #endif } else { #ifndef GALOIS_USE_DIST - globalTestCount = context->read_masks("test", num_samples, globalTestBegin, - globalTestEnd, test_masks); + globalTestCount = context->read_masks( + "test", num_samples, globalTestBegin, globalTestEnd, test_masks); #else - globalTestCount = context->read_masks("test", num_samples, globalTestBegin, - globalTestEnd, test_masks, dGraph); + globalTestCount = + context->read_masks("test", num_samples, globalTestBegin, + globalTestEnd, test_masks, dGraph); #endif } #ifndef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 7934b28aa7..578bb6abf7 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -48,7 +48,7 @@ class Sampler { //! average degree cut off to a clip int subg_deg; //! list of vertices active in the graph being maintained (masked_graph) - //VertexList vertices_; + // VertexList vertices_; //! 
List of training nodes; sampling set std::vector node_train; mask_t* masks_; @@ -67,7 +67,7 @@ class Sampler { void getMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); void checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size); + std::vector& DB2, size_t size); }; } // namespace deepgalois diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 58526d7a96..2fbf8e6617 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -45,7 +45,8 @@ void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { if (Context::is_single_class) { Context::h_labels_subg[count] = h_labels[i]; } else { - std::copy(Context::h_labels + i * Context::num_classes, Context::h_labels + (i + 1) * Context::num_classes, + std::copy(Context::h_labels + i * Context::num_classes, + Context::h_labels + (i + 1) * Context::num_classes, &Context::h_labels_subg[count * Context::num_classes]); } count++; @@ -54,7 +55,7 @@ void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { assert(count == m); } -//! generate input features for the subgraph, m is subgraph size, +//! generate input features for the subgraph, m is subgraph size, //! masks tells which vertices to use void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { size_t count = 0; @@ -62,7 +63,8 @@ void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { Context::h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(Context::h_feats + i * Context::feat_len, Context::h_feats + (i + 1) * Context::feat_len, + std::copy(Context::h_feats + i * Context::feat_len, + Context::h_feats + (i + 1) * Context::feat_len, &Context::h_feats_subg[count * Context::feat_len]); count++; } @@ -112,7 +114,7 @@ void Context::add_selfloop(Graph& og, Graph& g) { // TODO not actually implemented yet g.allocateFrom(og.size(), og.size() + og.sizeEdges()); g.constructNodes(); - //for (size_t src = 0; src < og.size(); src++) { + // for (size_t src = 0; src < og.size(); src++) { // //g.getData(src) = 1; // auto begin = og.edge_begin(src); // auto end = og.edge_end(src); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index f7ad18bc22..ee47917347 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -7,7 +7,7 @@ DistContext::DistContext() {} DistContext::~DistContext() {} void DistContext::saveGraph(DGraph* dGraph) { - graph_cpu = dGraph; + graph_cpu = dGraph; localVertices = graph_cpu->size(); } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index a986ec194d..f7882d1209 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -11,8 +11,8 @@ namespace deepgalois { #ifdef GALOIS_USE_DIST void Net::partitionInit(DGraph* graph, std::string dataset_str) { - this->dGraph = graph; - this->distContext = new deepgalois::DistContext(); + this->dGraph = graph; + this->distContext = new deepgalois::DistContext(); this->distContext->saveDistGraph(dGraph); this->distNumSamples = this->dGraph->size(); @@ -25,16 +25,17 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // std::cout << "Reading label masks ... 
"; this->distTrainMasks = new mask_t[this->distNumSamples]; this->distValMasks = new mask_t[this->distNumSamples]; - std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, 0); + std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, + 0); std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); if (dataset_str == "reddit") { - //this->globalTrainBegin = 0; - //this->globalTrainCount = 153431; - //this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - //this->globalValBegin = 153431; - //this->globalValCount = 23831; - //this->globalValEnd = this->globalValBegin + this->globalValCount; + // this->globalTrainBegin = 0; + // this->globalTrainCount = 153431; + // this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + // this->globalValBegin = 153431; + // this->globalValCount = 23831; + // this->globalValEnd = this->globalValBegin + this->globalValCount; // find local ID from global ID, set if it exists for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) { @@ -48,13 +49,16 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { } } } else { - globalTrainCount = this->distContext->read_masks("train", this->distNumSamples, globalTrainBegin, - globalTrainEnd, this->distTrainMasks, this->dGraph); - globalValCount = this->distContext->read_masks("val", this->distNumSamples, globalValBegin, globalValEnd, - this->distValMasks, this->dGraph); + globalTrainCount = this->distContext->read_masks( + "train", this->distNumSamples, globalTrainBegin, globalTrainEnd, + this->distTrainMasks, this->dGraph); + globalValCount = this->distContext->read_masks( + "val", this->distNumSamples, globalValBegin, globalValEnd, + this->distValMasks, this->dGraph); } - feature_dims[0] = this->distContext->read_features(); // input feature dimension: D + feature_dims[0] = + this->distContext->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index aa6fb6d686..dbf54a7b4b 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -16,7 +16,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { this->count_ = count; this->masks_ = masks; // save original graph - Sampler::graph = g; + Sampler::graph = g; // allocate the object for the new masked graph Sampler::masked_graph = new Graph(); @@ -54,9 +54,9 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { Sampler::avg_deg = masked_graph->sizeEdges() / masked_graph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - //size_t idx = 0; - //vertices_.resize(count); - //for (size_t i = begin; i < end; i++) { + // size_t idx = 0; + // vertices_.resize(count); + // for (size_t i = begin; i < end; i++) { // if (masks_[i] == 1) // vertices_[idx++] = i; //} @@ -88,8 +88,7 @@ void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, } //! 
returns a graph in the variable sub: it is g with the mask applied -void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, - Graph& sub) { +void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); this->get_masked_degrees(n, masks, g, degrees); // auto offsets = deepgalois::parallel_prefix_sum(degrees); @@ -127,10 +126,9 @@ void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, #endif } - // helper function for graph saint implementation below void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size) { + std::vector& DB2, size_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); DB1.reserve(DB1.capacity() * 2); @@ -222,7 +220,8 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { auto degree = getDegree(Sampler::masked_graph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = Sampler::masked_graph->getEdgeDst(Sampler::masked_graph->edge_begin(v) + neigh_v); + neigh_v = Sampler::masked_graph->getEdgeDst( + Sampler::masked_graph->edge_begin(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; @@ -363,7 +362,8 @@ inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, + Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size VertexList new_ids = this->reindexVertices(graph->size(), keptVertices); @@ -379,7 +379,8 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& #ifndef GALOIS_USE_DIST reindexGraph.allocateFrom(nv, ne); reindexGraph.constructNodes(); - VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping + VertexList old_ids(keptVertices.begin(), + keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN galois::do_all( galois::iterate((size_t)0, nv), @@ -390,7 +391,8 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph.fixEndEdge(i, offsets[i + 1]); unsigned j = 0; auto old_id = old_ids[i]; - for (auto e = origGraph.edge_begin(old_id); e != origGraph.edge_end(old_id); e++) { + for (auto e = origGraph.edge_begin(old_id); + e != origGraph.edge_end(old_id); e++) { auto dst = new_ids[origGraph.getEdgeDst(e)]; assert(dst < nv); reindexGraph.constructEdge(offsets[i] + j, dst, 0); @@ -417,7 +419,8 @@ void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, getMasks(Sampler::graph->size(), sampledSet, masks); Graph masked_sg; - this->getMaskedGraph(Sampler::graph->size(), masks, Sampler::masked_graph, + this->getMaskedGraph( + Sampler::graph->size(), masks, Sampler::masked_graph, masked_sg); // remove edges whose destination is not masked this->reindexSubgraph(sampledSet, masked_sg, sg); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 06d854d4b7..8b5ab8100f 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -691,8 +691,8 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, // float_t *grad; // float_malloc_device((end-begin)*len, grad); // d_cross_entropy_kernel<<>>( 
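subgraph_sample above chains the pieces: pick a vertex set, turn it into masks, drop edges to unmasked vertices with getMaskedGraph, then reindexSubgraph renumbers the kept vertices to 0..nv-1 and rewrites every surviving edge through that old-to-new mapping. A compact sketch of the renumbering step with STL containers (illustrative names, not the Sampler API):

#include <cstdint>
#include <set>
#include <unordered_map>
#include <vector>

// Renumber a sampled vertex set to 0..nv-1 and rewrite its induced edges.
// adj is the (already masked) adjacency of the original graph. Illustrative.
std::vector<std::vector<uint32_t>>
compactSubgraph(const std::vector<std::vector<uint32_t>>& adj,
                const std::set<uint32_t>& kept) {
  std::vector<uint32_t> oldID(kept.begin(), kept.end()); // new -> old
  std::unordered_map<uint32_t, uint32_t> newID;          // old -> new
  for (size_t i = 0; i < oldID.size(); ++i)
    newID[oldID[i]] = static_cast<uint32_t>(i);
  std::vector<std::vector<uint32_t>> sub(oldID.size());
  for (size_t i = 0; i < oldID.size(); ++i)
    for (uint32_t dst : adj[oldID[i]]) {
      auto it = newID.find(dst);
      if (it != newID.end())    // keep an edge only if its endpoint was sampled
        sub[i].push_back(it->second);
    }
  return sub;
}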
d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, - // BLOCK_SIZE>>>( + // CUDA_NUM_THREADS>>>( + // d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( // len, begin, end, masks, labels, out, grad); // CudaTest("solving d_cross_entropy kernel failed"); // d_softmax_kernel<<>>( From 596dbb050dee39c5a645cc5ad4093dcf67d9d669 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 18:41:36 -0500 Subject: [PATCH 274/660] context/net cleanup, reorg --- libdeepgalois/include/deepgalois/Context.h | 134 +++++++++--------- .../include/deepgalois/DistContext.h | 3 - libdeepgalois/include/deepgalois/Net.h | 58 +++----- libdeepgalois/src/Context.cpp | 72 +--------- libdeepgalois/src/DistContext.cpp | 5 - libdeepgalois/src/Net.cpp | 21 +-- lonestar/gnn/gcn/gcn.cpp | 10 +- 7 files changed, 103 insertions(+), 200 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 519a75d7f3..41751badcf 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -16,6 +16,69 @@ namespace deepgalois { class Context { + std::string dataset; + bool is_device; // is this on device or host + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + bool is_single_class; // single-class (one-hot) or multi-class label + bool is_selfloop_added; // whether selfloop is added to the input graph + bool use_subgraph; // whether to use subgraph + label_t* h_labels; // labels for classification. Single-class label: Nx1, + // multi-class label: NxE + float_t* h_feats; // input features: N x D + // label_t *h_labels_subg; // labels for subgraph + // float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factors; // normalization constant based on graph structure + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector norm_factors_subg; // normalization constant for subgraph + // float_t* norm_factors_subg; // normalization constant for subgraph + Reader reader; + + void alloc_norm_factor(); + void alloc_subgraph_norm_factor(int subg_id); + +#ifndef __GALOIS_HET_CUDA__ + Graph* graph_cpu; // the input graph, |V| = N + std::vector subgraphs_cpu; + void add_selfloop(Graph& og, Graph& g); + //! 
returns pointer to the graph + Graph* getGraphPointer() { return graph_cpu; } + Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } +#else + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU + + GraphGPU graph_gpu; // the input graph, |V| = N + std::vector subgraphs_gpu; + GraphGPU* getGraphPointer() { return &graph_gpu; } + GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } +#endif + public: Context(); //! initializer for gpu; goes ahead and sets a few things @@ -43,6 +106,7 @@ class Context { label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label + // label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } @@ -52,6 +116,7 @@ class Context { dataset = dataset_str; reader.init(dataset); } + void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features @@ -60,75 +125,6 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t* masks); //! Allocate subgraphs (but don't actually do sampling yet) void allocateSubgraphs(int num_subgraphs); - -#ifndef __GALOIS_HET_CUDA__ - Graph* graph_cpu; // the input graph, |V| = N - std::vector subgraphs_cpu; - void add_selfloop(Graph& og, Graph& g); - //! 
returns pointer to the graph - Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; - float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } - label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } -#else - GraphGPU graph_gpu; // the input graph, |V| = N - std::vector subgraphs_gpu; - GraphGPU* getGraphPointer() { return &graph_gpu; } - GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; - float_t* get_feats_ptr() { return d_feats; } - float_t* get_feats_subg_ptr() { return d_feats_subg; } - label_t* get_labels_ptr() { return d_labels; } - label_t* get_labels_subg_ptr() { return d_labels_subg; } - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { - return cusparse_matdescr_; - } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } -#endif - -protected: - std::string dataset; - bool is_device; // is this on device or host - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - bool is_single_class; // single-class (one-hot) or multi-class label - bool is_selfloop_added; // whether selfloop is added to the input graph - bool use_subgraph; // whether to use subgraph - label_t* h_labels; // labels for classification. Single-class label: Nx1, - // multi-class label: NxE - float_t* h_feats; // input features: N x D - // label_t *h_labels_subg; // labels for subgraph - // float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t* d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector norm_factors_subg; // normalization constant for subgraph - // float_t* norm_factors_subg; // normalization constant for subgraph - Reader reader; - - void alloc_norm_factor(); - void alloc_subgraph_norm_factor(int subg_id); - -#ifndef __GALOIS_HET_CUDA__ - void read_edgelist(const char* filename, bool symmetrize = false, - bool add_self_loop = false); -#else - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE - static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t - curand_generator_; // used to generate random numbers on GPU -#endif }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index e1b76fa00c..571a873e83 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,7 +10,6 @@ namespace deepgalois { class DistContext { - size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; @@ -27,8 +26,6 @@ class DistContext { DistContext(); ~DistContext(); - //! save graph pointer to context object - void saveDistGraph(Graph* dGraph); //! 
read labels of local nodes only size_t read_labels(std::string dataset_str); //! read features of local nodes only diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 195c524a2d..7547252b86 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -34,7 +34,7 @@ class Net { unsigned neighbor_sample_size; // neighbor sampling unsigned subgraph_sample_size; // subgraph sampling int num_threads; // number of threads - size_t num_samples; // number of samples: N + size_t globalSamples; // number of samples: N size_t distNumSamples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers @@ -107,31 +107,31 @@ class Net { num_epochs, ", hidden1 ", hidden1, ", learning_rate ", learning_rate, ", dropout_rate ", dropout_rate, ", weight_decay ", weight_decay, "\n"); - num_layers = num_conv_layers + 1; + this->num_layers = num_conv_layers + 1; // additional layers to add if (has_l2norm) - num_layers++; + this->num_layers++; if (has_dense) - num_layers++; - + this->num_layers++; // initialize feature metadata feature_dims.resize(num_layers + 1); - // initialze context + // initialze global graph context context = new deepgalois::Context(); context->set_dataset(dataset_str); - // read graph, get num nodes - num_samples = context->read_graph(selfloop); + // read *entire* graph, get num nodes + globalSamples = context->read_graph(selfloop); context->set_label_class(is_single_class); // read ground truth labels num_classes = context->read_labels(); - // get training and validation sets - globalTrainMasks = new mask_t[num_samples]; - globalValMasks = new mask_t[num_samples]; - std::fill(globalTrainMasks, globalTrainMasks + num_samples, 0); - std::fill(globalValMasks, globalValMasks + num_samples, 0); + // get training and validation sets: this is to create the training + // subgraph in the sampler + globalTrainMasks = new mask_t[globalSamples]; + globalValMasks = new mask_t[globalSamples]; + std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); + std::fill(globalValMasks, globalValMasks + globalSamples, 0); // reddit is hard coded if (dataset_str == "reddit") { @@ -149,9 +149,9 @@ class Net { globalValMasks[i] = 1; } else { globalTrainCount = - context->read_masks("train", num_samples, globalTrainBegin, + context->read_masks("train", globalSamples, globalTrainBegin, globalTrainEnd, globalTrainMasks); - globalValCount = context->read_masks("val", num_samples, globalValBegin, + globalValCount = context->read_masks("val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } @@ -161,23 +161,8 @@ class Net { "set\n"); } - // read features of vertices - feature_dims[0] = context->read_features(); // input feature dimension: D - - for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - - feature_dims[num_conv_layers] = num_classes; // output embedding: E - - if (has_l2norm) - feature_dims[num_conv_layers + 1] = - num_classes; // l2 normalized embedding: E - - if (has_dense) - feature_dims[num_layers - 1] = num_classes; // MLP embedding: E - - feature_dims[num_layers] = num_classes; // normalized output embedding: E - layers.resize(num_layers); + // features are read in distcontext, not this context (this context only + // used for sampling) // set the subgraph boolean if sample size is greater than 0 context->set_use_subgraph(subgraph_sample_size > 0); @@ -187,7 +172,7 @@ class 
Net { // Net() // : is_single_class(true), has_l2norm(false), has_dense(false), // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), - // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), // weight_decay(0.0), globalTrainBegin(0), globalTrainEnd(0), // globalTrainCount(0), globalValBegin(0), globalValEnd(0), @@ -201,13 +186,10 @@ class Net { size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } - size_t get_nnodes() { return num_samples; } - void normalize(); // Scale gradient to counterbalance accumulation void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - double total_train_time = 0.0; int num_subg_remain = 0; @@ -416,10 +398,10 @@ class Net { } else { #ifndef GALOIS_USE_DIST globalTestCount = context->read_masks( - "test", num_samples, globalTestBegin, globalTestEnd, test_masks); + "test", globalSamples, globalTestBegin, globalTestEnd, test_masks); #else globalTestCount = - context->read_masks("test", num_samples, globalTestBegin, + context->read_masks("test", globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); #endif } diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 2fbf8e6617..b44331fe1f 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -77,11 +77,7 @@ size_t Context::read_graph(bool selfloop) { std::string filetype = "gr"; galois::StatTimer Tread("GraphReadingTime"); Tread.start(); - if (filetype == "el") { - filename = path + dataset + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - read_edgelist(filename.c_str(), true); // symmetrize - } else if (filetype == "bin") { + if (filetype == "bin") { graph_cpu->readGraph(dataset); } else if (filetype == "gr") { graph_cpu = new Graph(); @@ -209,70 +205,4 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { #endif } -void Context::read_edgelist(const char* filename, bool symmetrize, - bool add_self_loop) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - size_t num_vertices_ = m; - size_t num_edges_ = 0; - std::cout << "num_vertices " << num_vertices_ << "\n"; - std::vector> vertices(m); - for (size_t i = 0; i < n; i++) { - std::set neighbors; - if (add_self_loop) - neighbors.insert(i); - vertices.push_back(neighbors); - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - VertexID u, v; - edge_stream >> u; - edge_stream >> v; - vertices[u].insert(v); - if (symmetrize) - vertices[v].insert(u); - } - in.close(); - for (size_t i = 0; i < n; i++) - num_edges_ += vertices[i].size(); - std::cout << "num_edges " << num_edges_ << "\n"; - - std::vector degrees; - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i++) - degrees[i] = vertices[i].size(); - std::vector offsets(degrees.size() + 1); - uint32_t total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; - degrees.clear(); - assert(num_edges_ == offsets[num_vertices_]); - EdgeID* colidx_ = new EdgeID[num_edges_]; - VertexID* rowptr_ = new VertexID[num_vertices_ + 1]; - for (size_t i = 0; i < num_vertices_ + 1; i++) - rowptr_[i] = offsets[i]; - 
for (size_t i = 0; i < num_vertices_; i++) { - for (auto dst : vertices[i]) - colidx_[offsets[i]++] = dst; - } - - auto g = getGraphPointer(); - g->allocateFrom(num_vertices_, num_edges_); - g->constructNodes(); - for (size_t i = 0; i < num_vertices_; i++) { - auto row_begin = rowptr_[i]; - auto row_end = rowptr_[i + 1]; - g->fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset++) - g->constructEdge(offset, colidx_[offset], 0); - } -} - } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index ee47917347..21236ef638 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,11 +6,6 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -void DistContext::saveGraph(DGraph* dGraph) { - graph_cpu = dGraph; - localVertices = graph_cpu->size(); -} - size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index f7882d1209..4ba6c23fe1 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -57,16 +57,20 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { this->distValMasks, this->dGraph); } - feature_dims[0] = - this->distContext->read_features(); // input feature dimension: D + // input feature dimension: D + feature_dims[0] = this->distContext->read_features(); for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers + 1] = - num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); } @@ -87,9 +91,6 @@ void Net::regularize() { layers[layer_id]->get_grads_ptr()); } -// Scale gradient to counterbalance accumulation -void Net::normalize() {} - /** * * @param begin GLOBAL begin diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index 702fc63516..0a47913a0f 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -17,17 +17,19 @@ int main(int argc, char** argv) { std::vector dummyVec; deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph, dataset); - // initialize entire on CPU + // initialize network + whole context on CPU + // read network, features, ground truth, initialize metadata + // default setting for now; can be customized by the user deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); + // initialize distributed context + network.partitionInit(dGraph, dataset); - // read network, features, ground truth, initialize metadata - // default setting for now; can be customized by the user + // construct layers from distributed context network.construct_layers(); network.print_layers_info(); deepgalois::ResourceManager rm; // tracks peak 
memory usage From 42317772c26a7316e56b4fce5f10aae46cf1164e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 19:29:04 -0500 Subject: [PATCH 275/660] library builds: mostly distcinting between DGraph and Graph --- libdeepgalois/include/deepgalois/Context.h | 4 +- .../include/deepgalois/DistContext.h | 21 ++++++---- libdeepgalois/include/deepgalois/GraphTypes.h | 6 +-- libdeepgalois/include/deepgalois/Net.h | 42 +++++++++---------- libdeepgalois/include/deepgalois/Sampler.h | 4 -- .../include/deepgalois/layers/aggregator.h | 1 + .../include/deepgalois/layers/layer.h | 16 ++----- libdeepgalois/src/DistContext.cpp | 10 ++--- libdeepgalois/src/Net.cpp | 10 ++--- libdeepgalois/src/Sampler.cpp | 5 +-- libdeepgalois/src/layers/aggregator.cpp | 8 +--- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +--- lonestar/gnn/include/lonestargnn.h | 5 +-- 13 files changed, 59 insertions(+), 80 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 41751badcf..373d91d463 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -43,6 +43,9 @@ class Context { void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); + +public: +// TODO separate below to public and private #ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; @@ -79,7 +82,6 @@ class Context { } #endif -public: Context(); //! initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 571a873e83..7f1c4fb1ea 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -12,9 +12,9 @@ namespace deepgalois { class DistContext { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - galois::graphs::GluonSubstrate* syncSubstrate; + galois::graphs::GluonSubstrate* syncSubstrate; - Graph* graph_cpu; // the input graph, |V| = N + DGraph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; label_t* h_labels; // labels for classification. Single-class label: Nx1, // multi-class label: NxE @@ -22,17 +22,23 @@ class DistContext { float_t* h_feats; // input features: N x D float_t* h_feats_subg; // input features for subgraph + // TODO needs to come from whole graph + float_t* norm_factors; // normalization constant based on graph structure + std::vector norm_factors_subg; // normalization constant for subgraph + public: DistContext(); ~DistContext(); + void saveDistGraph(DGraph* a) { graph_cpu = a; } + //! read labels of local nodes only - size_t read_labels(std::string dataset_str); + size_t read_labels(DGraph* dGraph, std::string dataset_str); //! read features of local nodes only size_t read_features(std::string dataset_str); //! 
read masks of local nodes only size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, - size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); // TODO define these void createSubgraphs(int) {} @@ -40,16 +46,17 @@ class DistContext { void gen_subgraph_feats(size_t, const mask_t*) {} float_t* get_norm_factors_ptr() { return norm_factors; } - Graph* getGraphPointer() { return graph_cpu; } + // TODO shouldn't return 0 always + float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } + DGraph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } - float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } void initializeSyncSubstrate(); - galois::graphs::GluonSubstrate* getSyncSubstrate(); + galois::graphs::GluonSubstrate* getSyncSubstrate(); //! return label for some node //! NOTE: this is LID, not GID diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index ba241c53f5..3a93565724 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -10,7 +10,7 @@ #endif namespace deepgalois { -using index_t = edge_iterator; -using DGraph = galois::graphs::DistGraph; -using Graph = LearningGraph; +using edge_iterator = index_t; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 7547252b86..0fb0dfefdb 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -15,7 +15,6 @@ #include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" -#endif namespace deepgalois { @@ -40,6 +39,7 @@ class Net { size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) int num_epochs; // number of epochs + unsigned h1; // hidden layer size float learning_rate; // learning rate float dropout_rate; // dropout rate float weight_decay; // weighti decay for over-fitting @@ -92,6 +92,7 @@ class Net { : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host @@ -102,7 +103,7 @@ class Net { assert(n_conv > 0); // TODO use galois print - galois >> gPrint(header, "Configuration: num_threads ", num_threads, + galois::gPrint(header, "Configuration: num_threads ", num_threads, ", num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, ", learning_rate ", learning_rate, ", dropout_rate ", dropout_rate, @@ -181,6 +182,7 @@ class Net { // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), // test_masks(NULL), context(NULL) {} + void init(); //! 
Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str); @@ -195,8 +197,8 @@ class Net { if (subgraph_sample_size) { context->allocateSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[num_samples * num_subgraphs]; - galois::gPrint(header, " Construct training vertex set induced graph...\n"; + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + galois::gPrint(header, " Construct training vertex set induced graph...\n"); sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); } @@ -222,7 +224,7 @@ class Net { // tid = galois::substrate::ThreadPool::getTID(); sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), - &subgraphs_masks[sid * num_samples], tid); + &subgraphs_masks[sid * globalSamples], tid); } //, galois::loopname("subgraph_gen")); #endif #endif @@ -253,12 +255,12 @@ class Net { } // update labels for subgraph context->gen_subgraph_labels(num_vertices_sg, - &subgraphs_masks[sg_id * num_samples]); + &subgraphs_masks[sg_id * globalSamples]); layers[num_layers - 1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph context->gen_subgraph_feats(num_vertices_sg, - &subgraphs_masks[sg_id * num_samples]); + &subgraphs_masks[sg_id * globalSamples]); layers[0]->set_feats_ptr( context->get_feats_subg_ptr()); // feed input data } @@ -343,7 +345,7 @@ class Net { if (subgraph_sample_size && type != "train") { // switch to the original graph for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(num_samples); + layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(context->getGraphPointer()); layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); @@ -380,7 +382,7 @@ class Net { // read masks of test set void read_test_masks(std::string dataset) { - test_masks = new mask_t[num_samples]; + test_masks = new mask_t[distNumSamples]; if (dataset == "reddit") { globalTestBegin = 177262; globalTestCount = 55703; @@ -396,14 +398,9 @@ class Net { } #endif } else { -#ifndef GALOIS_USE_DIST - globalTestCount = context->read_masks( - "test", globalSamples, globalTestBegin, globalTestEnd, test_masks); -#else globalTestCount = - context->read_masks("test", globalSamples, globalTestBegin, + distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); -#endif } #ifndef CPU_ONLY copy_test_masks_to_device(); @@ -443,8 +440,8 @@ class Net { void append_l2norm_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); @@ -454,8 +451,8 @@ class Net { void append_dense_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); @@ -465,7 +462,7 @@ class Net { void append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = 
out_dims[0] = num_samples; + in_dims[0] = out_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); if (is_single_class) @@ -481,7 +478,7 @@ class Net { assert(dropout_rate < 1.0); assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; + in_dims[0] = out_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, @@ -491,7 +488,6 @@ class Net { // update trainable weights after back-propagation void update_weights(optimizer* opt) { - normalize(); regularize(); for (size_t i = 0; i < num_layers; i++) { if (layers[i]->trainable()) { @@ -528,7 +524,7 @@ class Net { //! Save the context object to all layers of the network void set_contexts() { for (size_t i = 0; i < num_layers; i++) - layers[i]->set_context(context); + layers[i]->set_context(distContext); } //! set netphases for all layers in this network void set_netphases(net_phase phase) { diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 578bb6abf7..9c4ea06f12 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -1,5 +1,3 @@ -#ifndef GALOIS_USE_DIST - #pragma once #include "deepgalois/GraphTypes.h" @@ -71,5 +69,3 @@ class Sampler { }; } // namespace deepgalois - -#endif diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index cc6e22db00..997f006de8 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -5,6 +5,7 @@ #ifdef CPU_ONLY #include "deepgalois/GraphTypes.h" namespace deepgalois { +// TODO template arg void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a1c2ef630a..5ad33ae25a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,19 +9,16 @@ * Reused/revised under 3-BSD */ #include -#include "deepgalois/gtypes.h" -#ifndef GALOIS_USE_DIST +#include "deepgalois/GraphTypes.h" #include "deepgalois/Context.h" -#else + #include "deepgalois/DistContext.h" -#endif #include "deepgalois/optimizer.h" #include "deepgalois/layers/node.h" -#ifdef GALOIS_USE_DIST + #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" -#endif namespace deepgalois { @@ -40,11 +37,7 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: -#ifndef GALOIS_USE_DIST - using ContextType = deepgalois::Context; -#else using ContextType = deepgalois::DistContext; -#endif layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -179,17 +172,16 @@ class layer : public deepgalois::node { ContextType* context; label_t* labels; float_t* norm_consts; +// TODO #ifdef CPU_ONLY Graph* graph_cpu; #else GraphGPU* graph_gpu; #endif -#ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients deepgalois::GluonGradients* gradientGraph; galois::graphs::GluonSubstrate* syncSub; -#endif }; //! 
Connects tail to head's edge and sets that edge's target to tail diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 21236ef638..1b8fef711c 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -7,7 +7,7 @@ DistContext::DistContext() {} DistContext::~DistContext() {} size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { - Graph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); @@ -58,7 +58,7 @@ size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { } size_t DistContext::read_features(std::string dataset_str) { - Graph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading features from disk...\n"); @@ -101,7 +101,7 @@ size_t DistContext::read_features(std::string dataset_str) { size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, - mask_t* masks, Graph* dGraph) { + mask_t* masks, DGraph* dGraph) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -146,12 +146,12 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, float_t* DistContext::get_in_ptr() { return &h_feats[0]; } void DistContext::initializeSyncSubstrate() { - DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( + DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); } -galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { +galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 4ba6c23fe1..f7ff51bb2e 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -20,7 +20,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // or on master node only this->distContext->initializeSyncSubstrate(); - num_classes = this->distContext->read_labels(); + num_classes = this->distContext->read_labels(graph, dataset_str); // std::cout << "Reading label masks ... 
"; this->distTrainMasks = new mask_t[this->distNumSamples]; @@ -49,18 +49,18 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { } } } else { - globalTrainCount = this->distContext->read_masks( + globalTrainCount = this->distContext->read_masks(dataset_str, "train", this->distNumSamples, globalTrainBegin, globalTrainEnd, this->distTrainMasks, this->dGraph); - globalValCount = this->distContext->read_masks( + globalValCount = this->distContext->read_masks(dataset_str, "val", this->distNumSamples, globalValBegin, globalValEnd, this->distValMasks, this->dGraph); } // input feature dimension: D - feature_dims[0] = this->distContext->read_features(); + feature_dims[0] = this->distContext->read_features(dataset_str); for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[i] = this->h1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E if (this->has_l2norm) { // l2 normalized embedding: E diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index dbf54a7b4b..6a84a8de76 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -123,7 +123,6 @@ void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { , galois::loopname("gen_subgraph")); #endif -#endif } // helper function for graph saint implementation below @@ -183,7 +182,7 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { for (int i = 0; i < m; i++) { auto rand_idx = rand_r(&myseed) % Sampler::node_train.size(); db_t v = IA3[i] = Sampler::node_train[rand_idx]; - st.iisert(v); + st.insert(v); IA0[i] = getDegree(Sampler::masked_graph, v); IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; IA1[i] = 1; @@ -376,7 +375,6 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, auto ne = offsets[nv]; // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, // "\n"); -#ifndef GALOIS_USE_DIST reindexGraph.allocateFrom(nv, ne); reindexGraph.constructNodes(); VertexList old_ids(keptVertices.begin(), @@ -403,7 +401,6 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, , galois::loopname("construct_graph")); #endif -#endif } void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 9c3454d550..4e07ca96cf 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -2,20 +2,14 @@ #include "deepgalois/math_functions.hh" #include "galois/Galois.h" +// TODO template arg void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { // std::cout << "[update_all] graph size: " << n << "\n"; -#ifndef GALOIS_USE_DIST size_t n = g.size(); galois::do_all( galois::iterate(size_t(0), n), [&](const auto src) { -#else - auto& rangeObj = g.allNodesRange(); - galois::do_all( - galois::iterate(rangeObj), - [&](const auto src) { -#endif auto src_idx = src * len; // zero out the output data math::clear_cpu(len, &out[src_idx]); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d50f7bfb0a..97facbcd83 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -66,7 +66,7 @@ void graph_conv_layer::malloc_and_init() { size_t x = input_dims[0]; size_t y = input_dims[1]; 
size_t z = output_dims[1]; -#ifdef GALOIS_USE_DIST + // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); @@ -74,14 +74,9 @@ void graph_conv_layer::malloc_and_init() { new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), layer::gradientGraph->numHosts(), false); -#endif -#ifdef GALOIS_USE_DIST // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); -#else - rand_init_matrix(y, z, W); -#endif // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index d0255b9368..21e73cb024 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -10,10 +10,9 @@ #include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" #include -#ifdef GALOIS_USE_DIST + #include "galois/DistGalois.h" #include "galois/runtime/Network.h" -#endif namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, @@ -109,4 +108,4 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("(NULL)", "Hostname", name); } -#include "deepgalois/net.h" +#include "deepgalois/Net.h" From 4e0d315974b1f94de107fcf77327b744547175e3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 19:48:46 -0500 Subject: [PATCH 276/660] distcontext back as a source file --- libdeepgalois/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 58309084b1..2f05527318 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -83,6 +83,7 @@ if(NOT ENABLE_HETERO_GALOIS) src/math_functions.cpp src/optimizer.cpp src/Context.cpp + src/DistContext.cpp src/Sampler.cpp src/reader.cpp src/lgraph.cpp From 8fd535f261bf5a29b888a779e3f83b38e3d36c42 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 19:49:33 -0500 Subject: [PATCH 277/660] het cuda defs, signature change for dist read labels --- libdeepgalois/include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/Net.h | 6 +++--- .../include/deepgalois/layers/aggregator.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 8 ++++---- libdeepgalois/include/deepgalois/lgraph.h | 2 +- libdeepgalois/include/deepgalois/optimizer.h | 18 +++++++++--------- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/Net.cpp | 4 ++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- lonestar/gnn/gcn/gcn.cpp | 2 +- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 7f1c4fb1ea..212a29a287 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -33,7 +33,7 @@ class DistContext { void saveDistGraph(DGraph* a) { graph_cpu = a; } //! read labels of local nodes only - size_t read_labels(DGraph* dGraph, std::string dataset_str); + size_t read_labels(std::string dataset_str); //! read features of local nodes only size_t read_features(std::string dataset_str); //! 
read masks of local nodes only diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 0fb0dfefdb..548b01e79a 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -215,7 +215,7 @@ class Net { Timer t_subgen; t_subgen.Start(); // generate subgraphs -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ #ifndef GALOIS_USE_DIST for (int sid = 0; sid < num_subgraphs; sid++) { // galois::do_all(galois::iterate(size_t(0), @@ -341,7 +341,7 @@ class Net { count = globalTestCount; masks = test_masks; } -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ if (subgraph_sample_size && type != "train") { // switch to the original graph for (size_t i = 0; i < num_layers; i++) @@ -402,7 +402,7 @@ class Net { distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); } -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ copy_test_masks_to_device(); #endif } diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 997f006de8..142812c6ba 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -2,7 +2,7 @@ #include "deepgalois/types.h" //! For each node in the graph, add the embeddings of all of its neighbors //! together (using norm_factor if specified) -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ #include "deepgalois/GraphTypes.h" namespace deepgalois { // TODO template arg diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 09d4233c27..216b7e1935 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -53,7 +53,7 @@ class graph_conv_layer : public layer { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 5ad33ae25a..3a33d54440 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -72,7 +72,7 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } #else void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } @@ -97,7 +97,7 @@ class layer : public deepgalois::node { use_mask = false; if (masks != NULL) { use_mask = true; -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ masks_ = masks; #else d_masks_ = masks; @@ -135,7 +135,7 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
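// Editorial aside -- a hedged sketch, not part of this diff: update_weight()
// here forwards the accumulated weight gradient to the optimizer, whose CPU
// interface (see optimizer.h below) is update(const vec_t& dW, vec_t& W).
// For the plain gradient_descent optimizer with learning rate alpha and
// weight decay lambda, the step plausibly looks like the following (the real
// body lives in optimizer.cpp and may differ; names here are illustrative):
#include <cstddef>
#include <vector>
inline void sgd_step(const std::vector<float>& dW, std::vector<float>& W,
                     float alpha, float lambda) {
  for (std::size_t i = 0; i < W.size(); i++)
    W[i] -= alpha * (dW[i] + lambda * W[i]); // L2 weight decay folded in
}
// End of editorial aside.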
// bool parallel = (W.size() >= 512); @@ -173,7 +173,7 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; // TODO -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; #else GraphGPU* graph_gpu; diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 53382199f4..0c06a926cb 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -119,7 +119,7 @@ class LearningGraph { index_t* row_start_host_ptr() { return &rowptr_[0]; } index_t* edge_dst_host_ptr() { return &colidx_[0]; } -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index aa0dcbaab7..3a0139418e 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -30,7 +30,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; #endif virtual void reset() {} // override to implement pre-learning action @@ -53,7 +53,7 @@ struct stateful_optimizer : public optimizer { return E_[Index][&key]; } std::unordered_map E_[N]; -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ template float_t* get_gpu(const size_t n, const float_t* key); std::unordered_map dE_[N]; @@ -70,7 +70,7 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -87,7 +87,7 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -103,7 +103,7 @@ struct adam : public stateful_optimizer<2> { : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -128,7 +128,7 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -146,7 +146,7 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -164,7 +164,7 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : 
alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -185,7 +185,7 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 1b8fef711c..8576082c7b 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,7 +6,7 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { +size_t DistContext::read_labels(std::string dataset_str) { DGraph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index f7ff51bb2e..800d550048 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -20,7 +20,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // or on master node only this->distContext->initializeSyncSubstrate(); - num_classes = this->distContext->read_labels(graph, dataset_str); + num_classes = this->distContext->read_labels(dataset_str); // std::cout << "Reading label masks ... "; this->distTrainMasks = new mask_t[this->distNumSamples]; @@ -76,7 +76,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { } #endif -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ void Net::init() { if (subgraph_sample_size) sampler = new deepgalois::Sampler(); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 97facbcd83..00ca3f30e6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,7 +10,7 @@ float_t* _dataToSync = nullptr; //! 
sync long unsigned _syncVectorSize = 0; -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index 0a47913a0f..d9219438ae 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -15,7 +15,7 @@ int main(int argc, char** argv) { // Get a partitioned graph first std::vector dummyVec; - deepgalois::Graph* dGraph = + deepgalois::DGraph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); // initialize network + whole context on CPU From 425d4944d7c516b940a82329e3c9a535ceb2bb1f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 13:34:45 -0500 Subject: [PATCH 278/660] single process now runs (incorrectly probably + no sampling) --- .../include/deepgalois/DistContext.h | 8 +- libdeepgalois/include/deepgalois/Net.h | 81 +++++++++++-------- libdeepgalois/src/Context.cpp | 4 +- libdeepgalois/src/DistContext.cpp | 27 +++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 15 ++-- 5 files changed, 90 insertions(+), 45 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 212a29a287..0ffb2e1b0c 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -5,6 +5,7 @@ */ #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" +#include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" namespace deepgalois { @@ -23,7 +24,7 @@ class DistContext { float_t* h_feats_subg; // input features for subgraph // TODO needs to come from whole graph - float_t* norm_factors; // normalization constant based on graph structure + float_t* normFactors; // normalization constant based on graph structure std::vector norm_factors_subg; // normalization constant for subgraph public: @@ -45,7 +46,10 @@ class DistContext { void gen_subgraph_labels(size_t, const mask_t*) {} void gen_subgraph_feats(size_t, const mask_t*) {} - float_t* get_norm_factors_ptr() { return norm_factors; } + void constructNormFactor(deepgalois::Context* globalContext, bool is_subgraph, + int subg_id = 0); + + float_t* get_norm_factors_ptr() { return normFactors; } // TODO shouldn't return 0 always float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } DGraph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 548b01e79a..e50b081bd7 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -202,21 +202,23 @@ class Net { sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); } - std::cout << "\nStart training...\n"; + galois::gPrint(header, "Start training...\n"); Timer t_epoch; + // run epochs for (int ep = 0; ep < num_epochs; ep++) { t_epoch.Start(); +//////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { std::cout << "Generating " << num_subgraphs << " subgraphs "; Timer t_subgen; t_subgen.Start(); + // generate subgraphs #ifndef __GALOIS_HET_CUDA__ -#ifndef GALOIS_USE_DIST for (int sid = 0; sid < num_subgraphs; sid++) { // galois::do_all(galois::iterate(size_t(0), // size_t(num_subgraphs)),[&](const auto sid) { @@ -226,20 +228,18 @@ class Net { *(context->getSubgraphPointer(sid)), 
&subgraphs_masks[sid * globalSamples], tid); } //, galois::loopname("subgraph_gen")); -#endif #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; } -#ifndef GALOIS_USE_DIST for (int i = 0; i < num_subgraphs; i++) { auto sg_ptr = context->getSubgraphPointer(i); sg_ptr->degree_counting(); // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " // num_e ", sg_ptr->sizeEdges(), "\n"); } -#endif // GALOIS_USE_DIST + num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); @@ -263,10 +263,11 @@ class Net { &subgraphs_masks[sg_id * globalSamples]); layers[0]->set_feats_ptr( context->get_feats_subg_ptr()); // feed input data - } + } // end subgraph sample loop +//////////////////////////////////////////////////////////////////////////////// // training steps - std::cout << header << "Epoch " << std::setw(3) << ep << seperator; + galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -274,6 +275,8 @@ class Net { // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); + galois::gPrint(header, "Back prop\n"); + // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients @@ -285,11 +288,15 @@ class Net { // validation / testing set_netphases(net_phase::test); - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << seperator; + + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, seperator); + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; + if (need_validate && ep % val_interval == 0) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; @@ -304,20 +311,22 @@ class Net { << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; } - } + } // epoch loop + double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - std::cout << "\nAverage training time: " << avg_train_time - << " ms. Throughput: " << throughput << " epoch/s\n"; + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. inference or predict double evaluate(std::string type, acc_t& loss, acc_t& acc) { - // TODO may need to do something for the dist case Timer t_eval; t_eval.Start(); size_t begin = 0, end = 0, count = 0; mask_t* masks = NULL; + + // TODO global here good for dist case? 
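// Editorial aside -- a hedged sketch, not part of this diff: the branches
// below only pick which global range [begin, end) and which mask array this
// evaluation pass covers; scoring then counts just the vertices whose mask is
// set. A self-contained sketch of such a masked accuracy for single-class
// labels (argmax per row; every name here is hypothetical):
#include <algorithm>
#include <cstddef>
#include <cstdint>
inline double masked_accuracy_sketch(std::size_t begin, std::size_t end,
                                     std::size_t count,
                                     const std::uint8_t* masks,
                                     const float* preds,
                                     const std::uint8_t* labels,
                                     std::size_t num_classes) {
  std::size_t correct = 0;
  for (std::size_t v = begin; v < end; v++) {
    if (!masks[v])
      continue; // only vertices in the chosen train/val/test mask are scored
    const float* row = preds + v * num_classes;
    std::size_t guess = std::max_element(row, row + num_classes) - row;
    if (guess == labels[v])
      correct++;
  }
  return count ? double(correct) / double(count) : 0.0;
}
// End of editorial aside.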
if (type == "train") { begin = globalTrainBegin; end = globalTrainEnd; @@ -341,9 +350,10 @@ class Net { count = globalTestCount; masks = test_masks; } + #ifndef __GALOIS_HET_CUDA__ - if (subgraph_sample_size && - type != "train") { // switch to the original graph + // switch to the original graph if not training + if (subgraph_sample_size && type != "train") { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { @@ -362,6 +372,7 @@ class Net { masks = d_test_masks; } #endif + loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); label_t* labels; @@ -387,16 +398,11 @@ class Net { globalTestBegin = 177262; globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; -#ifndef GALOIS_USE_DIST - for (size_t i = globalTestBegin; i < globalTestEnd; i++) - test_masks[i] = 1; -#else for (size_t i = globalTestBegin; i < globalTestEnd; i++) { if (dGraph->isLocal(i)) { test_masks[dGraph->getLID(i)] = 1; } } -#endif } else { globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, @@ -411,28 +417,40 @@ class Net { void construct_layers() { // append conv layers std::cout << "\nConstructing layers...\n"; - for (size_t i = 0; i < num_conv_layers - 1; i++) + for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true + } + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false - if (has_l2norm) + + if (has_l2norm) { append_l2norm_layer(num_conv_layers); // l2_norm layer - if (has_dense) + } + + if (has_dense) { append_dense_layer(num_layers - 2); // dense layer + } + append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { layers[i]->add_edge(); } - for (size_t i = 1; i < num_layers; i++) + for (size_t i = 1; i < num_layers; i++) { connect(layers[i - 1], layers[i]); - for (size_t i = 0; i < num_layers; i++) + } + + for (size_t i = 0; i < num_layers; i++) { layers[i]->malloc_and_init(); - layers[0]->set_in_data(context->get_feats_ptr()); // feed input data + } + + layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - context->norm_factor_computing(0); + //context->norm_factor_computing(false); + distContext->constructNormFactor(context, false); for (size_t i = 0; i < num_conv_layers; i++) - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); set_contexts(); } @@ -499,14 +517,11 @@ class Net { //! forward propagation: [begin, end) is the range of samples used. //! 
calls "forward" on each layer and returns the loss of the final layer acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer + // set mask for the last layer; globals layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) { layers[i]->forward(); - // TODO need to sync model between layers here } // prediction error auto loss = layers[num_layers - 1]->get_prediction_loss(); diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index b44331fe1f..7c7bcdd61b 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -139,13 +139,13 @@ void Context::add_selfloop(Graph& og, Graph& g) { void Context::alloc_norm_factor() { Graph* g = getGraphPointer(); - if (norm_factors == NULL) + if (norm_factors == NULL) { #ifdef USE_MKL - // TODO why does MKL use size edges norm_factors = new float_t[g->sizeEdges()]; #else norm_factors = new float_t[g->size()]; #endif + } } void Context::alloc_subgraph_norm_factor(int subg_id) { diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 8576082c7b..66a030e036 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,6 +151,33 @@ void DistContext::initializeSyncSubstrate() { galois::runtime::getSystemNetworkInterface().Num, false); } +void DistContext::constructNormFactor(deepgalois::Context* globalContext, bool isSubgraph, + int subgraphID) { + // TODO IMPLEMENT THIS; get relevant info from the original context + globalContext->norm_factor_computing(isSubgraph, subgraphID); + + // TODO can check if already allocated instead of freeing every time + if (this->normFactors) { + free(this->normFactors); + } + +#ifdef USE_MKL + this->normFactors = new float_t[graph_cpu->sizeEdges()]; + galois::do_all(galois::iterate((size_t)0, graph_cpu->sizeEdges()), + [&] (unsigned i) { + normFactors[i] = 1; + } + ); +#else + this->normFactors = new float_t[graph_cpu->size()]; + galois::do_all(galois::iterate((size_t)0, graph_cpu->size()), + [&] (unsigned i) { + normFactors[i] = 1; + } + ); +#endif +} + galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 00ca3f30e6..c117b55d27 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -96,17 +96,16 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - // std::cout << "layer: " << name_ << "\n"; - // std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W - if (dropout_ && phase_ == net_phase::train) + if (dropout_ && phase_ == net_phase::train) { math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - else + } else { math::copy_cpu(x * y, in_data, in_temp); + } if (y > z) { math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, @@ -117,16 +116,16 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); } 
-#ifdef GALOIS_USE_DIST + // TODO sync of out_data required here + // TODO how to do this for the sampled case? deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync( "AggSync"); -#endif + // run relu activation on output if specified - if (act_) - math::relu_cpu(x * z, out_data, out_data); + if (act_) math::relu_cpu(x * z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ From 76915834828a71807a054e0256dea5709dbfc1c4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 16:49:39 -0500 Subject: [PATCH 279/660] norm factor used correctly for dist graph, single process --- libdeepgalois/include/deepgalois/Context.h | 4 + .../include/deepgalois/DistContext.h | 13 +-- libdeepgalois/include/deepgalois/Net.h | 12 ++- libdeepgalois/src/Context.cpp | 15 ++++ libdeepgalois/src/DistContext.cpp | 88 ++++++++++++++----- libdeepgalois/src/Net.cpp | 35 ++++---- 6 files changed, 122 insertions(+), 45 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 373d91d463..41e3aac23b 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -119,6 +119,10 @@ class Context { reader.init(dataset); } + //! Checks if subgraph being used, sets currenet graph, then calls degreex + //! counting + Graph* getCurrentGraph(bool usingSubGraph, int subID=0); + void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 0ffb2e1b0c..473242f05e 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -15,7 +15,7 @@ class DistContext { size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; - DGraph* graph_cpu; // the input graph, |V| = N + DGraph* partitionedGraph; // the input graph, |V| = N std::vector subgraphs_cpu; label_t* h_labels; // labels for classification. Single-class label: Nx1, // multi-class label: NxE @@ -31,7 +31,7 @@ class DistContext { DistContext(); ~DistContext(); - void saveDistGraph(DGraph* a) { graph_cpu = a; } + void saveDistGraph(DGraph* a) { partitionedGraph = a; } //! read labels of local nodes only size_t read_labels(std::string dataset_str); @@ -46,13 +46,11 @@ class DistContext { void gen_subgraph_labels(size_t, const mask_t*) {} void gen_subgraph_feats(size_t, const mask_t*) {} - void constructNormFactor(deepgalois::Context* globalContext, bool is_subgraph, - int subg_id = 0); float_t* get_norm_factors_ptr() { return normFactors; } // TODO shouldn't return 0 always float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } - DGraph* getGraphPointer() { return graph_cpu; } + DGraph* getGraphPointer() { return partitionedGraph; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } @@ -62,6 +60,11 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); + //! allocate the norm factor vector + void allocNormFactor(); + //! construct norm factor vector by using data from global graph + void constructNormFactor(deepgalois::Context* globalContext); + //! 
return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { return h_labels[i]; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e50b081bd7..d19a54156f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -248,6 +248,7 @@ class Net { // num_edges: ", subgraph_ptr->sizeEdges(), "\n"); for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_vertices_sg); + // TODO dist context->norm_factor_computing(1, sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraph_ptr); @@ -275,8 +276,6 @@ class Net { // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); - galois::gPrint(header, "Back prop\n"); - // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients @@ -375,18 +374,23 @@ class Net { loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); + + // labels will be subgraph labels if applicable label_t* labels; if (type == "train" && subgraph_sample_size) { labels = context->get_labels_subg_ptr(); } else { + // note this grabs global labels; everything passed in should be global labels = context->get_labels_ptr(); } + if (is_single_class) { acc = masked_accuracy(begin, end, count, masks, predictions, labels); } else { acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); } + t_eval.Stop(); return t_eval.Millisecs(); } @@ -448,7 +452,7 @@ class Net { layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure //context->norm_factor_computing(false); - distContext->constructNormFactor(context, false); + distContext->constructNormFactor(context); for (size_t i = 0; i < num_conv_layers; i++) layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); set_contexts(); @@ -518,6 +522,8 @@ class Net { //! 
calls "forward" on each layer and returns the loss of the final layer acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { // set mask for the last layer; globals + // TODO this should be distirbuted sample begin->end not global; fix later + // seems to be unused in code right now anyways layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); for (size_t i = 0; i < num_layers; i++) { diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 7c7bcdd61b..8f0b8d07f5 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -158,6 +158,21 @@ void Context::alloc_subgraph_norm_factor(int subg_id) { norm_factors_subg.clear(); } +// get current graph, also gets degrees of g +Graph* Context::getCurrentGraph(bool usingSubGraph, int subID) { + Graph* g; + + // grab orig or subgraph pointer as necessary + if (!usingSubGraph) { + g = getGraphPointer(); + } else { + g = getSubgraphPointer(subID); + } + g->degree_counting(); + + return g; +} + void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; float_t* constants; diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 66a030e036..7d0356e189 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -7,7 +7,7 @@ DistContext::DistContext() {} DistContext::~DistContext() {} size_t DistContext::read_labels(std::string dataset_str) { - DGraph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::partitionedGraph; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); @@ -58,7 +58,7 @@ size_t DistContext::read_labels(std::string dataset_str) { } size_t DistContext::read_features(std::string dataset_str) { - DGraph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::partitionedGraph; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading features from disk...\n"); @@ -147,37 +147,85 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } void DistContext::initializeSyncSubstrate() { DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( - *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, + *DistContext::partitionedGraph, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); } -void DistContext::constructNormFactor(deepgalois::Context* globalContext, bool isSubgraph, - int subgraphID) { +void DistContext::allocNormFactor() { + if (!normFactors) { +#ifdef USE_MKL + normFactors = new float_t[partitionedGraph->sizeEdges()]; +#else + normFactors = new float_t[partitionedGraph->size()]; +#endif + } + if (!normFactors) { + GALOIS_DIE("norm factors failed to be allocated"); + } +} + +//void DistContext::allocSubNormFactor(int subID) { +// if (!normFactors) { +//#ifdef USE_MKL +// normFactors = new float_t[partitionedGraph->sizeEdges()]; +//#else +// normFactors = new float_t[partitionedGraph->size()]; +//#endif +// } +// if (!normFactors) { +// GALOIS_DIE("norm factors failed to be allocated"); +// } +//} + +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { // TODO IMPLEMENT THIS; get relevant info from the original context - globalContext->norm_factor_computing(isSubgraph, subgraphID); + // sets current subgraph + gets degrees + Graph* wholeGraph = globalContext->getCurrentGraph(false); - // TODO can check if already allocated instead of freeing 
every time - if (this->normFactors) { - free(this->normFactors); - } + allocNormFactor(); + + // this is for testing purposes + //galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + // [&] (unsigned i) { + // this->normFactors[i] = 0; + // } + //); #ifdef USE_MKL - this->normFactors = new float_t[graph_cpu->sizeEdges()]; - galois::do_all(galois::iterate((size_t)0, graph_cpu->sizeEdges()), + galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), [&] (unsigned i) { - normFactors[i] = 1; - } + float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + + for (auto e = partitionedGraph->edge_begin(i); e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[e] = 0.0; + } else { + this->normFactors[e] = 1.0 / (c_i * c_j); + } + }, + galois::loopname("NormCountingEdge")); ); #else - this->normFactors = new float_t[graph_cpu->size()]; - galois::do_all(galois::iterate((size_t)0, graph_cpu->size()), - [&] (unsigned i) { - normFactors[i] = 1; - } - ); + galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + [&] (unsigned v) { + auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); #endif } +//void DistContext::constructNormFactorSub(deepgalois::Context* globalContext, bool isSubgraph, +// int subgraphID) { + galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 800d550048..47b9bdc334 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -9,7 +9,6 @@ namespace deepgalois { -#ifdef GALOIS_USE_DIST void Net::partitionInit(DGraph* graph, std::string dataset_str) { this->dGraph = graph; this->distContext = new deepgalois::DistContext(); @@ -74,7 +73,6 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); } -#endif #ifndef __GALOIS_HET_CUDA__ void Net::init() { @@ -95,21 +93,18 @@ void Net::regularize() { * * @param begin GLOBAL begin * @param end GLOBAL end + * @param masks: GLOBAL masks * @param count GLOBAL training count */ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { -#ifndef GALOIS_USE_DIST - galois::GAccumulator accuracy_all; -#else galois::DGAccumulator accuracy_all; galois::DGAccumulator sampleCount; - sampleCount.reset(); -#endif - accuracy_all.reset(); + sampleCount.reset(); + // TODO figure this out for distributed case galois::do_all( galois::iterate(begin, end), [&](const auto& i) { @@ -123,29 +118,34 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, accuracy_all += 1.0; } #else + // TODO dist subraph + // only look at owned nodes (i.e. 
masters); the prediction for these // should only by handled on the owner if (this->dGraph->isOwned(i)) { sampleCount += 1; uint32_t localID = this->dGraph->getLID(i); - if (masks[localID] == 1) { - // get prediction - auto pred = - math::argmax(num_classes, &preds[localID * num_classes]); - // check prediction - if ((label_t)pred == ground_truth[localID]) - accuracy_all += 1.0; + if (masks == NULL) { + GALOIS_DIE("subgraphs not implemented for dist yet"); + // subgraph here: TODO + } else { + if (masks[localID] == 1) { + // get prediction + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == ground_truth[localID]) + accuracy_all += 1.0; + } } } #endif }, galois::loopname("getMaskedLoss")); -#ifdef GALOIS_USE_DIST count = sampleCount.reduce(); galois::gDebug("sample count is ", count); -#endif // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)count; @@ -154,6 +154,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { + // TODO dist version return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } From 6881c67c15e363a5432cd7543d93fa94443badaa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 17:05:27 -0500 Subject: [PATCH 280/660] disabling sync for now; getting subgraph sampling back online first --- libdeepgalois/src/layers/graph_conv_layer.cpp | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c117b55d27..7acf787bae 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -119,10 +119,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO sync of out_data required here // TODO how to do this for the sampled case? 
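The constants built by constructNormFactor earlier in this patch implement the standard GCN normalization: the node-indexed variant stores 1/sqrt(deg(v)) per vertex, while the edge-indexed (USE_MKL) variant stores 1/(sqrt(deg(i))*sqrt(deg(j))) per edge, i.e. the symmetric scaling D^-1/2 A D^-1/2. Degrees come from the whole graph, so a partition looks them up through global IDs. A small worked example with made-up degrees:

    // Hypothetical 3-vertex graph: deg(0)=1, deg(1)=2, deg(2)=1 (illustrative only).
    // Node-indexed factors, 1/sqrt(deg(v)):
    //   normFactors = { 1.0, 0.7071, 1.0 }
    // During aggregation, edge (0,1) is effectively scaled by
    //   normFactors[0] * normFactors[1] = 1.0 * 0.7071 ~ 1/(sqrt(1)*sqrt(2)),
    // which is exactly what the edge-indexed variant precomputes per edge.
    // In the distributed code the degree lookup goes through
    //   wholeGraph->get_degree(partitionedGraph->getGID(v)),
    // as in the do_all loops shown above.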
- deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync( - "AggSync"); + //deepgalois::_syncVectorSize = z; + //deepgalois::_dataToSync = out_data; + //layer::context->getSyncSubstrate()->sync( + // "AggSync"); // run relu activation on output if specified if (act_) math::relu_cpu(x * z, out_data, out_data); @@ -163,21 +163,17 @@ void graph_conv_layer::back_propagation(const float_t* in_data, 0.0, &layer::weight_grad[0]); } -#ifdef GALOIS_USE_DIST // sync agg - deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync( - "AggSyncBack"); -#endif + //deepgalois::_syncVectorSize = z; + //deepgalois::_dataToSync = out_temp; + //layer::context->getSyncSubstrate()->sync( + // "AggSyncBack"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); -#ifdef GALOIS_USE_DIST - layer::syncSub->sync("GradientSync"); + //layer::syncSub->sync("GradientSync"); // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); -#endif } acc_t graph_conv_layer::get_weight_decay_loss() { From 0726cdbe25d928e8264aa10c1f7c65c7eb6dbabd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 23:16:14 -0500 Subject: [PATCH 281/660] sampling, fixes to sampling, distgraph lgraph creation, various other things --- .../include/deepgalois/DistContext.h | 57 ++++-- libdeepgalois/include/deepgalois/Net.h | 98 ++++++----- libdeepgalois/include/deepgalois/Sampler.h | 132 ++++++++++---- libdeepgalois/include/deepgalois/lgraph.h | 2 + libdeepgalois/src/Context.cpp | 2 +- libdeepgalois/src/DistContext.cpp | 115 +++++++++++- libdeepgalois/src/Net.cpp | 9 +- libdeepgalois/src/Sampler.cpp | 164 +++++++----------- 8 files changed, 377 insertions(+), 202 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 473242f05e..ffaf430792 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -15,23 +15,45 @@ class DistContext { size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; + Graph* lGraph; // laerning graph version DGraph* partitionedGraph; // the input graph, |V| = N - std::vector subgraphs_cpu; + std::vector partitionedSubgraphs; label_t* h_labels; // labels for classification. 
Single-class label: Nx1, // multi-class label: NxE - label_t* h_labels_subg; // labels for subgraph + std::vector h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D - float_t* h_feats_subg; // input features for subgraph + std::vector h_feats_subg; // input features for subgraph - // TODO needs to come from whole graph + // change regular one to a vector as well float_t* normFactors; // normalization constant based on graph structure - std::vector norm_factors_subg; // normalization constant for subgraph + std::vector normFactorsSub; // normalization constant for subgraph public: DistContext(); ~DistContext(); - void saveDistGraph(DGraph* a) { partitionedGraph = a; } + void saveDistGraph(DGraph* a) { + partitionedGraph = a; + + // construct lgraph from underlying lc csr graph + this->lGraph = new Graph(); + this->lGraph->allocateFrom(a->size(), a->sizeEdges()); + this->lGraph->constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, a->size()), + [&](const auto src) { + this->lGraph->fixEndEdge(src, *a->edge_end(src)); + index_t idx = *(a->edge_begin(src)); + + for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { + const auto dst = a->getEdgeDst(e); + this->lGraph->constructEdge(idx++, dst, 0); + } + }, + galois::loopname("lgraphcopy") + ); + } //! read labels of local nodes only size_t read_labels(std::string dataset_str); @@ -46,24 +68,30 @@ class DistContext { void gen_subgraph_labels(size_t, const mask_t*) {} void gen_subgraph_feats(size_t, const mask_t*) {} - - float_t* get_norm_factors_ptr() { return normFactors; } - // TODO shouldn't return 0 always - float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } DGraph* getGraphPointer() { return partitionedGraph; } - Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; + Graph* getLGraphPointer() { return lGraph; } + + Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return h_feats_subg; } + float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return h_labels_subg; } + label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); //! allocate the norm factor vector void allocNormFactor(); + void allocNormFactorSub(int subID); //! construct norm factor vector by using data from global graph void constructNormFactor(deepgalois::Context* globalContext); + void constructNormFactorSub(int subgraphID); + + void constructSubgraphLabels(size_t m, const mask_t* masks); + void constructSubgraphFeatures(size_t m, const mask_t* masks); + + float_t* get_norm_factors_ptr() { return normFactors; } + float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } //! return label for some node //! NOTE: this is LID, not GID @@ -71,6 +99,9 @@ class DistContext { //! returns pointer to the features of each local node float_t* get_in_ptr(); + + //! 
allocate memory for subgraphs (don't actually build them) + void allocateSubgraphs(int num_subgraphs); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index d19a54156f..9f49f8f847 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -55,7 +55,7 @@ class Net { size_t globalTestCount; int val_interval; int num_subgraphs; - int num_vertices_sg; + unsigned subgraphNumVertices; bool is_selfloop; mask_t* globalTrainMasks; // masks for training @@ -68,7 +68,7 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs + mask_t* subgraphs_masks; // masks for subgraphs; size of local graph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -167,6 +167,8 @@ class Net { // set the subgraph boolean if sample size is greater than 0 context->set_use_subgraph(subgraph_sample_size > 0); + + this->sampler = new Sampler(); } //! Default net constructor @@ -196,10 +198,11 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { - context->allocateSubgraphs(num_subgraphs); + distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; galois::gPrint(header, " Construct training vertex set induced graph...\n"); - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer(), + distContext->getGraphPointer()); } galois::gPrint(header, "Start training...\n"); @@ -207,68 +210,73 @@ class Net { Timer t_epoch; // run epochs - for (int ep = 0; ep < num_epochs; ep++) { + for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { t_epoch.Start(); +//////////////////////////////////////////////////////////////////////////////// +// Sampling //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - std::cout << "Generating " << num_subgraphs << " subgraphs "; + galois::gPrint(header, "Generating ", num_subgraphs, " subgraph(s)\n"); + // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); // generate subgraphs #ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { - // galois::do_all(galois::iterate(size_t(0), - // size_t(num_subgraphs)),[&](const auto sid) { - unsigned tid = 0; - // tid = galois::substrate::ThreadPool::getTID(); - sampler->subgraph_sample(subgraph_sample_size, - *(context->getSubgraphPointer(sid)), - &subgraphs_masks[sid * globalSamples], tid); - } //, galois::loopname("subgraph_gen")); + sampler->sampleSubgraph(subgraph_sample_size, + *(distContext->getSubgraphPointer(sid)), + &subgraphs_masks[sid * globalSamples], curEpoch); + } #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; } + // count their degrees for (int i = 0; i < num_subgraphs; i++) { - auto sg_ptr = context->getSubgraphPointer(i); + auto sg_ptr = distContext->getSubgraphPointer(i); sg_ptr->degree_counting(); // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " // num_e ", sg_ptr->sizeEdges(), "\n"); } + // choose a subgraph to use num_subg_remain--; int sg_id = num_subg_remain; - auto subgraph_ptr = context->getSubgraphPointer(sg_id); - 
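The saveDistGraph change earlier in this patch copies the partitioned LC_CSR graph into a LearningGraph with the same two-pass CSR idiom used throughout the sampler and subgraph code: size the arrays, fix each vertex's end-of-edge offset from a prefix sum of degrees, then fill destinations. A minimal, self-contained sketch of that idiom (toy sizes, not taken from the patch; it assumes the usual deepgalois typedefs Graph and index_t from types.h/lgraph.h are in scope):

    // Build a 3-vertex, 4-edge CSR graph by hand.
    deepgalois::Graph g;
    g.allocateFrom(3, 4);                      // reserve rowptr/colidx for |V|=3, |E|=4
    g.constructNodes();
    const index_t offsets[] = {0, 2, 3, 4};    // prefix sum of degrees 2, 1, 1
    for (size_t v = 0; v < 3; v++)
      g.fixEndEdge(v, offsets[v + 1]);         // rowptr[v+1]
    g.constructEdge(0, 1);  g.constructEdge(1, 2);   // edges of vertex 0
    g.constructEdge(2, 0);                           // edge of vertex 1
    g.constructEdge(3, 1);                           // edge of vertex 2
    g.degree_counting();                       // recompute cached degrees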
num_vertices_sg = subgraph_ptr->size(); - // galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", - // num_edges: ", subgraph_ptr->sizeEdges(), "\n"); - for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(num_vertices_sg); - // TODO dist - context->norm_factor_computing(1, sg_id); + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); + + // galois::gPrint("Subgraph num_vertices: ", subgraphNumVertices, ", + // num_edges: ", subgraphPointer->sizeEdges(), "\n"); + for (size_t i = 0; i < num_layers; i++) { + layers[i]->update_dim_size(this->subgraphNumVertices); + } + + // TODO dist version where i need global degrees + // change normalization constants + distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(subgraph_ptr); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); + layers[i]->set_graph_ptr(subgraphPointer); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); } + // update labels for subgraph - context->gen_subgraph_labels(num_vertices_sg, - &subgraphs_masks[sg_id * globalSamples]); - layers[num_layers - 1]->set_labels_ptr(context->get_labels_subg_ptr()); + distContext->constructSubgraphLabels(this->subgraphNumVertices, + &subgraphs_masks[sg_id * globalSamples]); + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); // update features for subgraph - context->gen_subgraph_feats(num_vertices_sg, - &subgraphs_masks[sg_id * globalSamples]); - layers[0]->set_feats_ptr( - context->get_feats_subg_ptr()); // feed input data + distContext->constructSubgraphFeatures(this->subgraphNumVertices, + &subgraphs_masks[sg_id * globalSamples]); + layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data } // end subgraph sample loop //////////////////////////////////////////////////////////////////////////////// // training steps - galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, seperator); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -296,7 +304,7 @@ class Net { double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; - if (need_validate && ep % val_interval == 0) { + if (need_validate && curEpoch % val_interval == 0) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); @@ -335,8 +343,8 @@ class Net { // update masks for subgraph masks = NULL; begin = 0; - end = num_vertices_sg; - count = num_vertices_sg; + end = this->subgraphNumVertices; + count = this->subgraphNumVertices; } } else if (type == "val") { begin = globalValBegin; @@ -356,11 +364,11 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getGraphPointer()); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + layers[i]->set_graph_ptr(distContext->getLGraphPointer()); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); } - layers[num_layers - 1]->set_labels_ptr(context->get_labels_ptr()); - layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); + layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data } #else if (type == "train") { @@ -378,10 +386,10 @@ class 
Net { // labels will be subgraph labels if applicable label_t* labels; if (type == "train" && subgraph_sample_size) { - labels = context->get_labels_subg_ptr(); + labels = distContext->get_labels_subg_ptr(); } else { // note this grabs global labels; everything passed in should be global - labels = context->get_labels_ptr(); + labels = distContext->get_labels_ptr(); } if (is_single_class) { @@ -487,11 +495,13 @@ class Net { in_dims[0] = out_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); + if (is_single_class) layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); + + layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); } //! Add a convolution layer to the network @@ -505,7 +515,7 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); - layers[layer_id]->set_graph_ptr(context->getGraphPointer()); + layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); } // update trainable weights after back-propagation diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 9c4ea06f12..d29c537ab9 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -4,38 +4,12 @@ namespace deepgalois { #define ETA 1.5 // length factor of DB in sampling #define SAMPLE_CLIP 3000 // clip degree in sampling -#define DEFAULT_SIZE_FRONTIER 3000 +#define DEFAULT_SIZE_FRONTIER 1000 #define DEFAULT_SIZE_SUBG 9000 class Sampler { public: typedef int db_t; - Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} - ~Sampler() {} - - //! sample a subgraph sg of size n from graph g - //! sg is overwritten/is output - void subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid = 0); - - //! API function for user-defined selection strategy - // TODO how to expose this? - virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set); - virtual void select_vertices(size_t n, int m, VertexSet& vertex_set, - unsigned tid); - - // galois::runtime::iterable > - // neighbor_sampler(Graph &g, VertexID v); - - edge_iterator sampled_edge_begin(Graph& g, VertexID v) { - return g.edge_begin(v); - } - - edge_iterator sampled_edge_end(Graph& g, VertexID v) { return g.edge_end(v); } - - //! Given a mask, construct the graph with only those vertices ans ave as the - //! masked graph in this class for the sampler. - void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g); protected: int m_; @@ -45,27 +19,111 @@ class Sampler { int avg_deg; //! average degree cut off to a clip int subg_deg; - //! list of vertices active in the graph being maintained (masked_graph) - // VertexList vertices_; + + //VertexList vertices_; + //mask_t* masks_; + //! List of training nodes; sampling set - std::vector node_train; - mask_t* masks_; + std::vector trainingNodes; + //! masked original graph; typically to the training set - Graph* masked_graph; - Graph* graph; + Graph* globalMaskedGraph; + Graph* globalGraph; + DGraph* partGraph; //! Reindex a graph to only contain those in the vertex set void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); + //! Given a graph, return a graph with edges to unmasked vertices removed in //! 
mg - void getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& mg); - void get_masked_degrees(size_t n, mask_t* masks, Graph* g, - std::vector& degrees); + template + void getMaskedGraph(size_t n, mask_t* masks, GraphTy* g, Graph& sub) { + std::vector degrees(n, 0); + this->getMaskedDegrees(n, masks, g, degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + size_t ne = offsets[n]; + //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); + + // note this constructs the full graph's nodes; just trims edges + sub.allocateFrom(n, ne); + sub.constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto src) { + sub.fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + sub.constructEdge(idx++, dst, 0); + } + } + } + , + galois::loopname("gen_subgraph")); + } + + +//! determine degree of each vertex in a masked graph (given by masks and g) +template +void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + assert(degrees.size() == n); +#ifdef PARALLEL_GEN + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { +#else + for (size_t src = 0; src < n; src++) { +#endif + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + //galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } + } + } +#ifdef PARALLEL_GEN + , + galois::loopname("update_degrees")); +#endif +} + //! Set masks bitset with IDs in the vertices VertexSet - void getMasks(size_t n, VertexSet vertices, mask_t* masks); + void createMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size); + + VertexSet convertToLID(VertexSet& gidSet); + +public: + Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} + ~Sampler() {} + + //! sample a subgraph sg of size n from graph g + //! sg is overwritten/is output + void sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed = 0); + + //! API function for user-defined selection strategy + // TODO how to expose this? + virtual void selectVertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set); + virtual void selectVertices(size_t n, int m, VertexSet& vertex_set, + unsigned seed); + + // galois::runtime::iterable > + // neighbor_sampler(Graph &g, VertexID v); + + //! Given a mask, construct the graph with only those vertices ans ave as the + //! masked graph in this class for the sampler. 
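Taken together, the members declared above form the pipeline that sampleSubgraph drives: pick a vertex set by frontier sampling over global IDs, map it onto the local partition, mask the partitioned graph down to those vertices, and reindex the survivors into a compact subgraph. A simplified sketch of that flow (condensed from the Sampler.cpp implementation later in this patch):

    // Condensed flow of Sampler::sampleSubgraph(n, sg, masks, seed).
    VertexSet sampledGIDs;
    selectVertices(n, m_, sampledGIDs, seed);            // frontier sampling on global IDs
    VertexSet sampledLIDs = convertToLID(sampledGIDs);   // keep vertices present on this host
    createMasks(partGraph->size(), sampledLIDs, masks);  // 0/1 mask over local vertices
    Graph maskedSG;
    getMaskedGraph(partGraph->size(), masks, partGraph, maskedSG); // induced edges only
    reindexSubgraph(sampledLIDs, maskedSG, sg);          // compact IDs 0..|sample|-1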
+ void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 0c06a926cb..40ca6c5a18 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -68,6 +68,7 @@ class LearningGraph { degrees_.resize(num_vertices_); rowptr_[0] = 0; } + void constructEdge(index_t eid, index_t dst, edata_t edata = 0) { assert(dst < num_vertices_); assert(eid < num_edges_); @@ -75,6 +76,7 @@ class LearningGraph { if (edge_data_) edge_data_[eid] = edata; } + void add_selfloop() { auto old_colidx_ = colidx_; colidx_.resize(num_vertices_ + num_edges_); diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 8f0b8d07f5..17b9872f74 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -93,6 +93,7 @@ size_t Context::read_graph(bool selfloop) { //} else galois::graphs::readGraph(*graph_cpu, filename); } else { graph_cpu->readGraph(dataset); + galois::gPrint("graph read size ", graph_cpu->size()); } // TODO dist version of self loop } else { @@ -155,7 +156,6 @@ void Context::alloc_subgraph_norm_factor(int subg_id) { #else norm_factors_subg.resize(g->size()); #endif - norm_factors_subg.clear(); } // get current graph, also gets degrees of g diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7d0356e189..7899a180e2 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -164,6 +164,16 @@ void DistContext::allocNormFactor() { } } +void DistContext::allocNormFactorSub(int subID) { +#ifdef USE_MKL + normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); +#else + normFactorsSub.resize(partitionedSubgraphs[subID]->size()); +#endif + // TODO clean out? +} + + //void DistContext::allocSubNormFactor(int subID) { // if (!normFactors) { //#ifdef USE_MKL @@ -223,11 +233,112 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { #endif } -//void DistContext::constructNormFactorSub(deepgalois::Context* globalContext, bool isSubgraph, -// int subgraphID) { +void DistContext::constructNormFactorSub(int subgraphID) { + // right now norm factor based on subgraph + // TODO fix this + + allocNormFactorSub(subgraphID); + + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); + + // TODO using partitioned subgraph rather than whoel graph; i.e. 
dist setting wrong +#ifdef USE_MKL + galois::do_all(galois::iterate((size_t)0, graphToUse->size()), + [&] (unsigned i) { + //float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); + + for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); e++) { + const auto j = graphToUse->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[e] = 0.0; + } else { + this->normFactors[e] = 1.0 / (c_i * c_j); + } + }, + galois::loopname("NormCountingEdge")); + ); +#else + galois::do_all(galois::iterate((size_t)0, graphToUse.size()), + [&] (unsigned v) { + //auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + auto degree = graphToUse.get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); +#endif +} +//! generate labels for the subgraph, m is subgraph size, mask +//! tells which vertices to use +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + // TODO multiclass + + // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + //if (DistContext::is_single_class) { + //} else { + // DistContext::h_labels_subg.resize(m * Context::num_classes); + //} + + DistContext::h_labels_subg.resize(m); + + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + //if (Context::is_single_class) { + //} else { + // std::copy(Context::h_labels + i * Context::num_classes, + // Context::h_labels + (i + 1) * Context::num_classes, + // &Context::h_labels_subg[count * Context::num_classes]); + //} + DistContext::h_labels_subg[count] = h_labels[i]; + count++; + } + } + GALOIS_ASSERT(count == m); +} + +//! generate input features for the subgraph, m is subgraph size, +//! 
masks tells which vertices to use +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + size_t count = 0; + // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + count++; + } + } + GALOIS_ASSERT(count == m); +} + + + + + + + galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; +void DistContext::allocateSubgraphs(int num_subgraphs) { + partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + partitionedSubgraphs[i] = new Graph(); + } +} + } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 47b9bdc334..3500911c74 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -127,13 +127,16 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, uint32_t localID = this->dGraph->getLID(i); if (masks == NULL) { - GALOIS_DIE("subgraphs not implemented for dist yet"); + //GALOIS_DIE("subgraphs not implemented for dist yet"); // subgraph here: TODO + auto pred = math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == ground_truth[localID]) + accuracy_all += 1.0; } else { if (masks[localID] == 1) { // get prediction - auto pred = - math::argmax(num_classes, &preds[localID * num_classes]); + auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 6a84a8de76..3dfbcf8c86 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -12,48 +12,52 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { this->count_ = count; - this->masks_ = masks; // save original graph - Sampler::graph = g; + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; + // allocate the object for the new masked graph - Sampler::masked_graph = new Graph(); + Sampler::globalMaskedGraph = new Graph(); std::vector degrees(g->size(), 0); // get degrees of nodes that will be in new graph - this->get_masked_degrees(g->size(), masks, g, degrees); + this->getMaskedDegrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[g->size()]; - // save ids (on original graph) of training nodes to vector + // save ids (of original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { if (masks[i] == 1) - Sampler::node_train.push_back(i); + Sampler::trainingNodes.push_back(i); } - Sampler::masked_graph->allocateFrom(g->size(), ne); - Sampler::masked_graph->constructNodes(); + Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); + Sampler::globalMaskedGraph->constructNodes(); // same as original graph, except keep only edges involved in masks galois::do_all( galois::iterate((size_t)0, g->size()), [&](const auto src) { - Sampler::masked_graph->fixEndEdge(src, offsets[src + 1]); + 
Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) - Sampler::masked_graph->constructEdge(idx++, dst, 0); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); } } }, galois::loopname("gen_subgraph")); - Sampler::masked_graph->degree_counting(); - Sampler::avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + Sampler::globalMaskedGraph->degree_counting(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + // TODO masked part graph as well to save time later + // size_t idx = 0; // vertices_.resize(count); // for (size_t i = begin; i < end; i++) { @@ -62,69 +66,6 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { //} } -//! determine degree of each vertex in a masked graph (given by masks and g) -void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, - std::vector& degrees) { - assert(degrees.size() == n); -#ifdef PARALLEL_GEN - galois::do_all( - galois::iterate(size_t(0), n), - [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) - degrees[src]++; - } - } - } -#ifdef PARALLEL_GEN - , - galois::loopname("update_degrees")); -#endif -} - -//! returns a graph in the variable sub: it is g with the mask applied -void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { - std::vector degrees(n, 0); - this->get_masked_degrees(n, masks, g, degrees); - // auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto offsets = deepgalois::prefix_sum(degrees); - size_t ne = offsets[n]; - // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", - // ne, "\n"); - // - - // note this constructs the full graph's nodes; just trims edges - sub.allocateFrom(n, ne); - sub.constructNodes(); - -#ifdef PARALLEL_GEN - galois::do_all( - galois::iterate((size_t)0, n), - [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif - sub.fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) - sub.constructEdge(idx++, dst, 0); - } - } - } -#ifdef PARALLEL_GEN - , - galois::loopname("gen_subgraph")); -#endif -} - // helper function for graph saint implementation below void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size) { @@ -154,7 +95,10 @@ void print_vertex_set(VertexSet vertex_set) { // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp -void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { +void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { + if (n < (size_t)m) { + m = n; + } unsigned myseed = seed; // unsigned myseed = tid; @@ -174,16 +118,16 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { IA3.resize(m); // galois::gPrint("seed ", myseed, " m ", m, "\n"); - // galois::gPrint("node_train size: ", node_train.size(), "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); // printf("( "); - // for 
(size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; // printf(")\n"); for (int i = 0; i < m; i++) { - auto rand_idx = rand_r(&myseed) % Sampler::node_train.size(); - db_t v = IA3[i] = Sampler::node_train[rand_idx]; + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; st.insert(v); - IA0[i] = getDegree(Sampler::masked_graph, v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; IA1[i] = 1; IA2[i] = 0; @@ -216,17 +160,17 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { } choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); db_t v = DB0[choose]; - auto degree = getDegree(Sampler::masked_graph, v); + auto degree = getDegree(Sampler::globalMaskedGraph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = Sampler::masked_graph->getEdgeDst( - Sampler::masked_graph->edge_begin(v) + neigh_v); + neigh_v = Sampler::globalMaskedGraph->getEdgeDst( + Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; for (auto i = choose; i < choose - DB1[choose]; i++) DB0[i] = db_t(-1); - newsize = getDegree(Sampler::masked_graph, neigh_v); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; } else newsize = 0; @@ -301,8 +245,8 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { // n: number of vertices in the subgraph; // m: number of vertices in the frontier. // our implementation of GraphSAINT sampling -void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set) { +void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set) { // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, // graph size: ", g->size(), "\n"); assert(nv == vertices.size()); @@ -343,7 +287,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, */ } -void Sampler::getMasks(size_t n, VertexSet vertices, mask_t* masks) { +void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); for (auto v : vertices) @@ -365,7 +309,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size - VertexList new_ids = this->reindexVertices(graph->size(), keptVertices); + VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : keptVertices) { degrees[new_ids[v]] = getDegree(&origGraph, v); @@ -403,23 +347,39 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, #endif } -void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, - unsigned tid) { +VertexSet Sampler::convertToLID(VertexSet& gidSet) { + VertexSet existingLIDs; + + for (auto i : gidSet) { + if (partGraph->isLocal(i)) { + existingLIDs.insert(partGraph->getLID(i)); + } + } + + return existingLIDs; +} + +void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, + unsigned seed) { VertexSet 
sampledSet; // n = 9000 by default - // this->select_vertices(count_, n, m_, masked_graph, vertices_, sampledSet); - + // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, sampledSet); // do the sampling of vertices from training set + using masked graph - this->select_vertices(n, m_, sampledSet, tid); // m = 1000 by default + this->selectVertices(n, m_, sampledSet, seed); // m = 1000 by default + + // sampledSet is a list of *global* ids in the graph + // create new vertex set with LIDs for partitioned graph + VertexSet sampledLIDs = this->convertToLID(sampledSet); - // create the masks on the masked_graph - getMasks(Sampler::graph->size(), sampledSet, masks); + // create the masks + createMasks(Sampler::partGraph->size(), sampledLIDs, masks); - Graph masked_sg; - this->getMaskedGraph( - Sampler::graph->size(), masks, Sampler::masked_graph, - masked_sg); // remove edges whose destination is not masked - this->reindexSubgraph(sampledSet, masked_sg, sg); + // this graph will contain sampled vertices and induced subgraph for it + Graph maskedSG; + // TODO use partMaskedGraph once constructed later + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + maskedSG); // remove edges whose destination is not masked + this->reindexSubgraph(sampledLIDs, maskedSG, sg); } } // namespace deepgalois From 066a0b64b2c21b00b0a7afd536ca2891ec73e664 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 14:32:46 -0500 Subject: [PATCH 282/660] subgraph norm factor generation: assing to correct var also added a bunch of commented out print debugs --- libdeepgalois/include/deepgalois/Net.h | 9 +++++- libdeepgalois/include/deepgalois/Sampler.h | 4 ++- libdeepgalois/src/DistContext.cpp | 36 ++++++++++++++-------- libdeepgalois/src/Sampler.cpp | 16 ++++++++-- 4 files changed, 47 insertions(+), 18 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 9f49f8f847..cb559f4a4d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -200,7 +200,7 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; - galois::gPrint(header, " Construct training vertex set induced graph...\n"); + galois::gPrint(header, "Constructing training vertex set induced graph...\n"); sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer(), distContext->getGraphPointer()); } @@ -272,6 +272,13 @@ class Net { distContext->constructSubgraphFeatures(this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data + + //Graph* testing = distContext->getSubgraphPointer(sg_id); + //for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) { + // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); + // } + //} } // end subgraph sample loop //////////////////////////////////////////////////////////////////////////////// diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index d29c537ab9..f736fa6a8f 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -57,8 +57,10 @@ class Sampler { auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) 
+ if (masks[dst] == 1) { + //galois::gPrint(src, " ", dst, "\n"); sub.constructEdge(idx++, dst, 0); + } } } } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7899a180e2..2e23d967fe 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -68,9 +68,17 @@ size_t DistContext::read_features(std::string dataset_str) { in.open(filename, std::ios::in); size_t m; // m = number of global vertices - // header read in >> m >> feat_len >> std::ws; + +// std::string file_dims = path + dataset_str + "-dims.txt"; +// std::ifstream ifs; +// ifs.open(file_dims, std::ios::in); +// ifs >> m >> feat_len >> std::ws; +// ifs.close(); +// + + galois::gPrint("N x D: ", m, " x ", feat_len, "\n"); // use local size, not global size h_feats = new float_t[dGraph->size() * feat_len]; @@ -87,9 +95,9 @@ size_t DistContext::read_features(std::string dataset_str) { edge_stream >> v; // actual feature edge_stream >> w; - h_feats[dGraph->getLID(u) * feat_len + v] = w; } + //galois::gPrint(u, "\n"); } in.close(); @@ -235,7 +243,7 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { void DistContext::constructNormFactorSub(int subgraphID) { // right now norm factor based on subgraph - // TODO fix this + // TODO fix this for dist execution allocNormFactorSub(subgraphID); @@ -254,9 +262,9 @@ void DistContext::constructNormFactorSub(int subgraphID) { float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[e] = 0.0; + this->normFactorsSub[e] = 0.0; } else { - this->normFactors[e] = 1.0 / (c_i * c_j); + this->normFactorsSub[e] = 1.0 / (c_i * c_j); } }, galois::loopname("NormCountingEdge")); @@ -268,10 +276,11 @@ void DistContext::constructNormFactorSub(int subgraphID) { auto degree = graphToUse.get_degree(v); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) { - this->normFactors[v] = 0.0; + this->normFactorsSub[v] = 0.0; } else { - this->normFactors[v] = 1.0 / temp; + this->normFactorsSub[v] = 1.0 / temp; } + galois::gPrint(this->normFactorsSub[v], "\n"); }, galois::loopname("NormCountingNode")); #endif @@ -300,6 +309,7 @@ void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { // &Context::h_labels_subg[count * Context::num_classes]); //} DistContext::h_labels_subg[count] = h_labels[i]; + //galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); count++; } } @@ -317,6 +327,12 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { std::copy(DistContext::h_feats + i * DistContext::feat_len, DistContext::h_feats + (i + 1) * DistContext::feat_len, &DistContext::h_feats_subg[count * DistContext::feat_len]); + //for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], " "); + // } + //} + //galois::gPrint("\n"); count++; } } @@ -324,12 +340,6 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { } - - - - - - galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 3dfbcf8c86..d57bf85537 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -45,8 +45,10 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != 
g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) + if (masks[dst] == 1) { + //galois::gPrint(src, " ", dst, "\n"); Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } } } }, @@ -359,8 +361,7 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } -void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, - unsigned seed) { +void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed) { VertexSet sampledSet; // n = 9000 by default // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, sampledSet); @@ -371,6 +372,13 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, // create new vertex set with LIDs for partitioned graph VertexSet sampledLIDs = this->convertToLID(sampledSet); + //VertexSet sampledLIDs; + //galois::gPrint("part graph num edges is ", partGraph->sizeEdges(), "\n"); + //galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), "\n"); + //for (auto i : this->trainingNodes) { + // sampledLIDs.insert(i); + //} + // create the masks createMasks(Sampler::partGraph->size(), sampledLIDs, masks); @@ -380,6 +388,8 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); // remove edges whose destination is not masked this->reindexSubgraph(sampledLIDs, maskedSG, sg); + + //galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); } } // namespace deepgalois From b770e71e15a481cbad8ac252fdeebaa790ea9001 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 16:43:35 -0500 Subject: [PATCH 283/660] dist: feature reading using binary --- libdeepgalois/src/DistContext.cpp | 61 +++++++++++++++---------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 2e23d967fe..7e27ea0d92 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -64,42 +64,41 @@ size_t DistContext::read_features(std::string dataset_str) { std::string filename = path + dataset_str + ".ft"; std::ifstream in; - std::string line; - - in.open(filename, std::ios::in); - size_t m; // m = number of global vertices - // header read - in >> m >> feat_len >> std::ws; - -// std::string file_dims = path + dataset_str + "-dims.txt"; -// std::ifstream ifs; -// ifs.open(file_dims, std::ios::in); -// ifs >> m >> feat_len >> std::ws; -// ifs.close(); -// + size_t m; // m = number of vertices + // dimension read + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> this->feat_len >> std::ws; + ifs.close(); galois::gPrint("N x D: ", m, " x ", feat_len, "\n"); - // use local size, not global size + + // TODO read in without using 2 in-memory buffers + // full read feats to load into h_feats + float_t* fullFeats = new float_t[m * feat_len]; + // actual stored feats h_feats = new float_t[dGraph->size() * feat_len]; - // loop through all features - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - // vertex to set feature for - edge_stream >> u; - // only set if local - if (dGraph->isLocal(u)) { - // feature index - edge_stream >> v; - // actual feature - edge_stream >> w; - h_feats[dGraph->getLID(u) * feat_len + v] = w; + // read in full feats + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary | std::ios::in); + 
in.read((char*)fullFeats, sizeof(float_t) * m * feat_len); + in.close(); + + // get the local ids we want + size_t count = 0; + for (size_t i = 0; i < m; i++) { + if (dGraph->isLocal(i)) { + //h_feats[count * feat_len] = fullFeats[i]; + std::copy(fullFeats + i * DistContext::feat_len, + fullFeats + (i + 1) * DistContext::feat_len, + &this->h_feats[count * DistContext::feat_len]); + count++; } - //galois::gPrint(u, "\n"); } - in.close(); + GALOIS_ASSERT(count == dGraph->size()); + free(fullFeats); galois::gPrint("[", myID, "] Done with features, feature length: ", feat_len, "\n"); @@ -280,7 +279,7 @@ void DistContext::constructNormFactorSub(int subgraphID) { } else { this->normFactorsSub[v] = 1.0 / temp; } - galois::gPrint(this->normFactorsSub[v], "\n"); + //galois::gPrint(this->normFactorsSub[v], "\n"); }, galois::loopname("NormCountingNode")); #endif From d3ae95ec8888691b96027adfe540a58a584b591b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 18:22:08 -0500 Subject: [PATCH 284/660] multiclass reading reimplemented in distcontext --- .../include/deepgalois/DistContext.h | 11 ++- libdeepgalois/include/deepgalois/Net.h | 5 +- libdeepgalois/src/DistContext.cpp | 93 +++++++++---------- libdeepgalois/src/Net.cpp | 4 +- lonestar/gnn/gcn/gcn.cpp | 2 +- 5 files changed, 56 insertions(+), 59 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index ffaf430792..afd441b6e1 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -25,17 +25,20 @@ class DistContext { std::vector h_feats_subg; // input features for subgraph // change regular one to a vector as well - float_t* normFactors; // normalization constant based on graph structure + std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph + bool usingSingleClass; public: - DistContext(); + // TODO better constructor + DistContext() : usingSingleClass(true) {}; ~DistContext(); void saveDistGraph(DGraph* a) { partitionedGraph = a; // construct lgraph from underlying lc csr graph + // TODO fix this so i don't have more than 1 copy of graph in memory this->lGraph = new Graph(); this->lGraph->allocateFrom(a->size(), a->sizeEdges()); this->lGraph->constructNodes(); @@ -56,7 +59,7 @@ class DistContext { } //! read labels of local nodes only - size_t read_labels(std::string dataset_str); + size_t read_labels(bool isSingleClassLabel, std::string dataset_str); //! read features of local nodes only size_t read_features(std::string dataset_str); //! read masks of local nodes only @@ -90,7 +93,7 @@ class DistContext { void constructSubgraphLabels(size_t m, const mask_t* masks); void constructSubgraphFeatures(size_t m, const mask_t* masks); - float_t* get_norm_factors_ptr() { return normFactors; } + float_t* get_norm_factors_ptr() { return normFactors.data(); } float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } //! 
return label for some node diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index cb559f4a4d..8bc27df33f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -123,9 +123,6 @@ class Net { context->set_dataset(dataset_str); // read *entire* graph, get num nodes globalSamples = context->read_graph(selfloop); - context->set_label_class(is_single_class); - // read ground truth labels - num_classes = context->read_labels(); // get training and validation sets: this is to create the training // subgraph in the sampler @@ -186,7 +183,7 @@ class Net { void init(); //! Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str); + void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7e27ea0d92..d354301eb5 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,11 +3,11 @@ #include "deepgalois/configs.h" namespace deepgalois { -DistContext::DistContext() {} DistContext::~DistContext() {} -size_t DistContext::read_labels(std::string dataset_str) { +size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; + this->usingSingleClass = isSingleClassLabel; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); @@ -17,11 +17,18 @@ size_t DistContext::read_labels(std::string dataset_str) { in.open(filename, std::ios::in); size_t m; // read file header - in >> m >> num_classes >> std::ws; + in >> m >> this->num_classes >> std::ws; assert(m == dGraph->globalSize()); + // size of labels should be # local nodes - h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for - // each vertex: N x 1 + if (isSingleClassLabel) { + galois::gPrint("[", myID, "] One hot labels...\n"); + this->h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for + // each vertex: N x 1 + } else { + galois::gPrint("[", myID, "] Multi-class labels...\n"); + this->h_labels = new label_t[dGraph->size() * this->num_classes]; // multi-class label for each vertex: N x E + } uint32_t foundVertices = 0; unsigned v = 0; @@ -32,14 +39,21 @@ size_t DistContext::read_labels(std::string dataset_str) { std::istringstream label_stream(line); unsigned x; // for each class - for (size_t idx = 0; idx < num_classes; ++idx) { + for (size_t idx = 0; idx < this->num_classes; ++idx) { // check if that class is labeled label_stream >> x; - if (x != 0) { - // set local id - h_labels[dGraph->getLID(v)] = idx; + + // diff between single and multi class + if (isSingleClassLabel) { + if (x != 0) { + // set local id + this->h_labels[dGraph->getLID(v)] = idx; + foundVertices++; + break; + } + } else { + this->h_labels[dGraph->getLID(v) * this->num_classes + idx] = x; foundVertices++; - break; } } } @@ -159,42 +173,26 @@ void DistContext::initializeSyncSubstrate() { } void DistContext::allocNormFactor() { - if (!normFactors) { #ifdef USE_MKL - normFactors = new float_t[partitionedGraph->sizeEdges()]; + this->normFactors.resize(partitionedGraph->sizeEdges()); #else - normFactors = new float_t[partitionedGraph->size()]; + 
this->normFactors.resize(partitionedGraph->size()); #endif - } - if (!normFactors) { - GALOIS_DIE("norm factors failed to be allocated"); - } + // TODO clean out? } void DistContext::allocNormFactorSub(int subID) { #ifdef USE_MKL - normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); + this->normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); #else - normFactorsSub.resize(partitionedSubgraphs[subID]->size()); + this->normFactorsSub.resize(partitionedSubgraphs[subID]->size()); #endif // TODO clean out? } -//void DistContext::allocSubNormFactor(int subID) { -// if (!normFactors) { -//#ifdef USE_MKL -// normFactors = new float_t[partitionedGraph->sizeEdges()]; -//#else -// normFactors = new float_t[partitionedGraph->size()]; -//#endif -// } -// if (!normFactors) { -// GALOIS_DIE("norm factors failed to be allocated"); -// } -//} - void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + galois::gPrint("Norm factor construction\n"); // TODO IMPLEMENT THIS; get relevant info from the original context // sets current subgraph + gets degrees Graph* wholeGraph = globalContext->getCurrentGraph(false); @@ -238,9 +236,11 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { }, galois::loopname("NormCountingNode")); #endif + galois::gPrint("Norm factor construction done\n"); } void DistContext::constructNormFactorSub(int subgraphID) { + galois::gPrint("Sub norm factor construction\n"); // right now norm factor based on subgraph // TODO fix this for dist execution @@ -283,31 +283,28 @@ void DistContext::constructNormFactorSub(int subgraphID) { }, galois::loopname("NormCountingNode")); #endif + galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! 
tells which vertices to use void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { - // TODO multiclass - - // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - //if (DistContext::is_single_class) { - //} else { - // DistContext::h_labels_subg.resize(m * Context::num_classes); - //} - - DistContext::h_labels_subg.resize(m); + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg.resize(m); + } else { + DistContext::h_labels_subg.resize(m * DistContext::num_classes); + } size_t count = 0; // see which labels to copy over for this subgraph for (size_t i = 0; i < this->partitionedGraph->size(); i++) { if (masks[i] == 1) { - //if (Context::is_single_class) { - //} else { - // std::copy(Context::h_labels + i * Context::num_classes, - // Context::h_labels + (i + 1) * Context::num_classes, - // &Context::h_labels_subg[count * Context::num_classes]); - //} - DistContext::h_labels_subg[count] = h_labels[i]; + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg[count] = h_labels[i]; + } else { + std::copy(DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); + } //galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); count++; } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 3500911c74..ce23b2b51d 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -9,7 +9,7 @@ namespace deepgalois { -void Net::partitionInit(DGraph* graph, std::string dataset_str) { +void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { this->dGraph = graph; this->distContext = new deepgalois::DistContext(); this->distContext->saveDistGraph(dGraph); @@ -19,7 +19,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // or on master node only this->distContext->initializeSyncSubstrate(); - num_classes = this->distContext->read_labels(dataset_str); + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); // std::cout << "Reading label masks ... 
"; this->distTrainMasks = new mask_t[this->distNumSamples]; diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index d9219438ae..fabd27667f 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -27,7 +27,7 @@ int main(int argc, char** argv) { neighbor_sample_sz, subgraph_sample_sz, val_interval); // initialize distributed context - network.partitionInit(dGraph, dataset); + network.partitionInit(dGraph, dataset, is_single_class); // construct layers from distributed context network.construct_layers(); From ad6424209a3efd3527742a1a4e1b7888eff1a9e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 18:52:30 -0500 Subject: [PATCH 285/660] bunch of cleanup: context is now topo only, removed unused functions --- libdeepgalois/include/deepgalois/Context.h | 60 +------- .../include/deepgalois/DistContext.h | 5 - libdeepgalois/include/deepgalois/Net.h | 19 +-- libdeepgalois/include/deepgalois/Sampler.h | 1 + libdeepgalois/src/Context.cpp | 141 +----------------- libdeepgalois/src/DistContext.cpp | 9 +- libdeepgalois/src/Sampler.cpp | 41 ++--- 7 files changed, 45 insertions(+), 231 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 41e3aac23b..7faebd7c83 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -18,31 +18,14 @@ namespace deepgalois { class Context { std::string dataset; bool is_device; // is this on device or host - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph - bool use_subgraph; // whether to use subgraph - label_t* h_labels; // labels for classification. Single-class label: Nx1, - // multi-class label: NxE - float_t* h_feats; // input features: N x D - // label_t *h_labels_subg; // labels for subgraph - // float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector norm_factors_subg; // normalization constant for subgraph - // float_t* norm_factors_subg; // normalization constant for subgraph - Reader reader; - - void alloc_norm_factor(); - void alloc_subgraph_norm_factor(int subg_id); + Reader reader; public: // TODO separate below to public and private @@ -52,11 +35,6 @@ class Context { void add_selfloop(Graph& og, Graph& g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; - float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } - label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE @@ -85,35 +63,18 @@ class Context { Context(); //! 
initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) - : is_device(use_gpu), n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} + : is_device(use_gpu), + is_selfloop_added(false), d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL) {} ~Context(); size_t read_graph(bool selfloop); - size_t read_labels() { - num_classes = reader.read_labels(is_single_class, h_labels); - return num_classes; - } - size_t read_features() { - feat_len = reader.read_features(h_feats); - return feat_len; - } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { return reader.read_masks(mask_type, n, begin, end, masks); } - label_t get_label(size_t i) { - return h_labels[i]; - } // single-class (one-hot) label - - // label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } - // // multi-class label - float_t* get_norm_factors_ptr() { return norm_factors; } - float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } - void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); @@ -121,16 +82,9 @@ class Context { //! Checks if subgraph being used, sets currenet graph, then calls degreex //! counting - Graph* getCurrentGraph(bool usingSubGraph, int subID=0); + Graph* getFullGraph(); - void set_label_class(bool is_single = true) { is_single_class = is_single; } - void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_computing(bool is_subgraph, int subg_id = 0); - void gen_subgraph_labels(size_t m, const mask_t* masks); - void gen_subgraph_feats(size_t m, const mask_t* masks); - //! Allocate subgraphs (but don't actually do sampling yet) - void allocateSubgraphs(int num_subgraphs); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index afd441b6e1..be7dd7be45 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -66,11 +66,6 @@ class DistContext { size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); - // TODO define these - void createSubgraphs(int) {} - void gen_subgraph_labels(size_t, const mask_t*) {} - void gen_subgraph_feats(size_t, const mask_t*) {} - DGraph* getGraphPointer() { return partitionedGraph; } Graph* getLGraphPointer() { return lGraph; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 8bc27df33f..c10c262a02 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -76,7 +76,7 @@ class Net { // TODO optimize single host case //! context holds all of the graph data - deepgalois::Context* context; + deepgalois::Context* graphTopologyContext; //! 
dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; @@ -119,10 +119,10 @@ class Net { feature_dims.resize(num_layers + 1); // initialze global graph context - context = new deepgalois::Context(); - context->set_dataset(dataset_str); + graphTopologyContext = new deepgalois::Context(); + graphTopologyContext->set_dataset(dataset_str); // read *entire* graph, get num nodes - globalSamples = context->read_graph(selfloop); + globalSamples = graphTopologyContext->read_graph(selfloop); // get training and validation sets: this is to create the training // subgraph in the sampler @@ -147,9 +147,9 @@ class Net { globalValMasks[i] = 1; } else { globalTrainCount = - context->read_masks("train", globalSamples, globalTrainBegin, + graphTopologyContext->read_masks("train", globalSamples, globalTrainBegin, globalTrainEnd, globalTrainMasks); - globalValCount = context->read_masks("val", globalSamples, globalValBegin, + globalValCount = graphTopologyContext->read_masks("val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } @@ -162,9 +162,6 @@ class Net { // features are read in distcontext, not this context (this context only // used for sampling) - // set the subgraph boolean if sample size is greater than 0 - context->set_use_subgraph(subgraph_sample_size > 0); - this->sampler = new Sampler(); } @@ -198,7 +195,7 @@ class Net { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; galois::gPrint(header, "Constructing training vertex set induced graph...\n"); - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer(), + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, graphTopologyContext->getGraphPointer(), distContext->getGraphPointer()); } @@ -464,7 +461,7 @@ class Net { layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure //context->norm_factor_computing(false); - distContext->constructNormFactor(context); + distContext->constructNormFactor(graphTopologyContext); for (size_t i = 0; i < num_conv_layers; i++) layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); set_contexts(); diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index f736fa6a8f..1a9fabd9ec 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -103,6 +103,7 @@ void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size); + //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); public: diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 17b9872f74..c9bbe9e706 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -10,67 +10,7 @@ namespace deepgalois { Context::Context() : Context(false) {} -Context::~Context() { - if (h_labels) - delete[] h_labels; - if (h_feats) - delete[] h_feats; - if (norm_factors) - delete[] norm_factors; - // if (h_feats_subg) delete[] h_feats_subg; - // if (h_labels_subg) delete[] h_labels_subg; - // if (norm_factors_subg) delete[] norm_factors_subg; -} - -void Context::allocateSubgraphs(int num_subgraphs) { - subgraphs_cpu.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) - subgraphs_cpu[i] = new Graph(); -} - -//! generate labels for the subgraph, m is subgraph size, mask -//! 
tells which vertices to use -void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { - // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - if (Context::is_single_class) { - Context::h_labels_subg.resize(m); - } else { - Context::h_labels_subg.resize(m * Context::num_classes); - } - - size_t count = 0; - // see which labels to copy over for this subgraph - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - if (Context::is_single_class) { - Context::h_labels_subg[count] = h_labels[i]; - } else { - std::copy(Context::h_labels + i * Context::num_classes, - Context::h_labels + (i + 1) * Context::num_classes, - &Context::h_labels_subg[count * Context::num_classes]); - } - count++; - } - } - assert(count == m); -} - -//! generate input features for the subgraph, m is subgraph size, -//! masks tells which vertices to use -void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { - size_t count = 0; - // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - Context::h_feats_subg.resize(m * feat_len); - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - std::copy(Context::h_feats + i * Context::feat_len, - Context::h_feats + (i + 1) * Context::feat_len, - &Context::h_feats_subg[count * Context::feat_len]); - count++; - } - } - assert(count == m); -} +Context::~Context() {} size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; @@ -138,86 +78,11 @@ void Context::add_selfloop(Graph& og, Graph& g) { //} } -void Context::alloc_norm_factor() { - Graph* g = getGraphPointer(); - if (norm_factors == NULL) { -#ifdef USE_MKL - norm_factors = new float_t[g->sizeEdges()]; -#else - norm_factors = new float_t[g->size()]; -#endif - } -} - -void Context::alloc_subgraph_norm_factor(int subg_id) { - Graph* g = getSubgraphPointer(subg_id); -#ifdef USE_MKL - norm_factors_subg.resize(g->sizeEdges()); -#else - norm_factors_subg.resize(g->size()); -#endif -} - // get current graph, also gets degrees of g -Graph* Context::getCurrentGraph(bool usingSubGraph, int subID) { - Graph* g; - - // grab orig or subgraph pointer as necessary - if (!usingSubGraph) { - g = getGraphPointer(); - } else { - g = getSubgraphPointer(subID); - } +Graph* Context::getFullGraph() { + Graph* g = getGraphPointer(); g->degree_counting(); - return g; } -void Context::norm_factor_computing(bool is_subgraph, int subg_id) { - Graph* g; - float_t* constants; - - // grab orig or subgraph pointer as necessary - if (!is_subgraph) { - g = getGraphPointer(); - alloc_norm_factor(); - constants = norm_factors; - } else { - g = getSubgraphPointer(subg_id); - alloc_subgraph_norm_factor(subg_id); - constants = get_norm_factors_subg_ptr(); - } - - auto g_size = g->size(); - g->degree_counting(); -#ifdef USE_MKL - galois::do_all( - galois::iterate((size_t)0, g_size), - [&](auto i) { - float_t c_i = std::sqrt(float_t(g->get_degree(i))); - for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { - const auto j = g->getEdgeDst(e); - float_t c_j = std::sqrt(float_t(g->get_degree(j))); - if (c_i == 0.0 || c_j == 0.0) - constants[e] = 0.0; - else - constants[e] = 1.0 / (c_i * c_j); - } - }, - galois::loopname("NormCountingEdge")); -#else - galois::do_all( - galois::iterate((size_t)0, g_size), - [&](auto v) { - auto degree = g->get_degree(v); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) - constants[v] = 0.0; - else - constants[v] = 1.0 / temp; - }, - galois::loopname("NormCountingVertex")); -#endif -} - } // namespace deepgalois diff --git 
a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index d354301eb5..ea98e26007 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -5,6 +5,7 @@ namespace deepgalois { DistContext::~DistContext() {} +// TODO move to reader class size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; @@ -71,6 +72,7 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str return num_classes; } +// TODO move to reader class size_t DistContext::read_features(std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -120,6 +122,7 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } +// TODO move to reader class size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { @@ -193,12 +196,10 @@ void DistContext::allocNormFactorSub(int subID) { void DistContext::constructNormFactor(deepgalois::Context* globalContext) { galois::gPrint("Norm factor construction\n"); - // TODO IMPLEMENT THIS; get relevant info from the original context - // sets current subgraph + gets degrees - Graph* wholeGraph = globalContext->getCurrentGraph(false); + // using original graph to get ids + Graph* wholeGraph = globalContext->getFullGraph(); allocNormFactor(); - // this is for testing purposes //galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), // [&] (unsigned i) { diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index d57bf85537..466cee6584 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -6,9 +6,23 @@ #define PARALLEL_GEN namespace deepgalois { + +//! debug function: prints out sets of vertices +void print_vertex_set(VertexSet vertex_set) { + unsigned counter = 0; + unsigned n = vertex_set.size(); + galois::gPrint("( "); + for (int i : vertex_set) { + counter++; + if (counter > 16 && counter < n - 16) + continue; + galois::gPrint(i, " "); + } + galois::gPrint(")\n"); +} + +//! helper function to get degree of some vertex given some graph inline unsigned getDegree(Graph* g, index_t v) { - // return g->get_degree(v); - // return std::distance(g->edge_begin(v), g->edge_end(v)); return g->edge_end(v) - g->edge_begin(v); } @@ -58,7 +72,8 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - // TODO masked part graph as well to save time later + // TODO masked part graph as well to save time later; right now constructing + // from full part graph // size_t idx = 0; // vertices_.resize(count); @@ -81,19 +96,7 @@ void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, DB2.resize(size); } -//! 
debug function: prints out sets of vertices -void print_vertex_set(VertexSet vertex_set) { - unsigned counter = 0; - unsigned n = vertex_set.size(); - galois::gPrint("( "); - for (int i : vertex_set) { - counter++; - if (counter > 16 && counter < n - 16) - continue; - galois::gPrint(i, " "); - } - galois::gPrint(")\n"); -} + // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp @@ -292,8 +295,7 @@ void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); - for (auto v : vertices) - masks[v] = 1; + for (auto v : vertices) masks[v] = 1; } inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { @@ -351,13 +353,12 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, VertexSet Sampler::convertToLID(VertexSet& gidSet) { VertexSet existingLIDs; - + // find local selected vertices, convert to lid for (auto i : gidSet) { if (partGraph->isLocal(i)) { existingLIDs.insert(partGraph->getLID(i)); } } - return existingLIDs; } From a326fb6600fc000500a10cc8aaa6fdab9cf4cb2b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 18:56:30 -0500 Subject: [PATCH 286/660] ran clang-format 10 on deepgalois --- libdeepgalois/include/deepgalois/Context.h | 5 +- .../include/deepgalois/DistContext.h | 20 +- libdeepgalois/include/deepgalois/Net.h | 95 ++++---- libdeepgalois/include/deepgalois/Sampler.h | 59 +++-- libdeepgalois/src/DistContext.cpp | 207 ++++++++++-------- libdeepgalois/src/Net.cpp | 25 ++- libdeepgalois/src/Sampler.cpp | 33 +-- libdeepgalois/src/layers/aggregator.cpp | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 17 +- libdeepgalois/src/sampler.cu | 87 +++++--- 10 files changed, 297 insertions(+), 253 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 7faebd7c83..341270201a 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -63,9 +63,8 @@ class Context { Context(); //! initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) - : is_device(use_gpu), - is_selfloop_added(false), d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL) {} + : is_device(use_gpu), is_selfloop_added(false), d_labels(NULL), + d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL) {} ~Context(); size_t read_graph(bool selfloop); diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index be7dd7be45..14b2ae18b7 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -11,27 +11,28 @@ namespace deepgalois { class DistContext { - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; - Graph* lGraph; // laerning graph version + Graph* lGraph; // laerning graph version DGraph* partitionedGraph; // the input graph, |V| = N std::vector partitionedSubgraphs; - label_t* h_labels; // labels for classification. Single-class label: Nx1, - // multi-class label: NxE + label_t* h_labels; // labels for classification. 
Single-class label: Nx1, + // multi-class label: NxE std::vector h_labels_subg; // labels for subgraph - float_t* h_feats; // input features: N x D + float_t* h_feats; // input features: N x D std::vector h_feats_subg; // input features for subgraph // change regular one to a vector as well - std::vector normFactors; // normalization constant based on graph structure + std::vector + normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph bool usingSingleClass; public: // TODO better constructor - DistContext() : usingSingleClass(true) {}; + DistContext() : usingSingleClass(true){}; ~DistContext(); void saveDistGraph(DGraph* a) { @@ -54,8 +55,7 @@ class DistContext { this->lGraph->constructEdge(idx++, dst, 0); } }, - galois::loopname("lgraphcopy") - ); + galois::loopname("lgraphcopy")); } //! read labels of local nodes only diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index c10c262a02..04f51f317b 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -33,13 +33,13 @@ class Net { unsigned neighbor_sample_size; // neighbor sampling unsigned subgraph_sample_size; // subgraph sampling int num_threads; // number of threads - size_t globalSamples; // number of samples: N + size_t globalSamples; // number of samples: N size_t distNumSamples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) int num_epochs; // number of epochs - unsigned h1; // hidden layer size + unsigned h1; // hidden layer size float learning_rate; // learning rate float dropout_rate; // dropout rate float weight_decay; // weighti decay for over-fitting @@ -92,8 +92,7 @@ class Net { : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - h1(hidden1), - learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host this->myID = galois::runtime::getSystemNetworkInterface().ID; @@ -104,10 +103,10 @@ class Net { // TODO use galois print galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); this->num_layers = num_conv_layers + 1; // additional layers to add @@ -146,11 +145,11 @@ class Net { for (size_t i = globalValBegin; i < globalValEnd; i++) globalValMasks[i] = 1; } else { - globalTrainCount = - graphTopologyContext->read_masks("train", globalSamples, globalTrainBegin, - globalTrainEnd, globalTrainMasks); - globalValCount = graphTopologyContext->read_masks("val", globalSamples, globalValBegin, - globalValEnd, globalValMasks); + globalTrainCount = graphTopologyContext->read_masks( + "train", globalSamples, globalTrainBegin, globalTrainEnd, + globalTrainMasks); + globalValCount = 
graphTopologyContext->read_masks( + "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } // make sure sampel size isn't greater than what we have to train with @@ -180,7 +179,8 @@ class Net { void init(); //! Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -194,8 +194,10 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; - galois::gPrint(header, "Constructing training vertex set induced graph...\n"); - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, graphTopologyContext->getGraphPointer(), + galois::gPrint(header, + "Constructing training vertex set induced graph...\n"); + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, + graphTopologyContext->getGraphPointer(), distContext->getGraphPointer()); } @@ -207,12 +209,13 @@ class Net { for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { t_epoch.Start(); -//////////////////////////////////////////////////////////////////////////////// -// Sampling -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + // Sampling + //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint(header, "Generating ", num_subgraphs, " subgraph(s)\n"); + galois::gPrint(header, "Generating ", num_subgraphs, + " subgraph(s)\n"); // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); @@ -220,9 +223,9 @@ class Net { // generate subgraphs #ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { - sampler->sampleSubgraph(subgraph_sample_size, - *(distContext->getSubgraphPointer(sid)), - &subgraphs_masks[sid * globalSamples], curEpoch); + sampler->sampleSubgraph( + subgraph_sample_size, *(distContext->getSubgraphPointer(sid)), + &subgraphs_masks[sid * globalSamples], curEpoch); } #endif num_subg_remain = num_subgraphs; @@ -239,9 +242,9 @@ class Net { // choose a subgraph to use num_subg_remain--; - int sg_id = num_subg_remain; - auto subgraphPointer = distContext->getSubgraphPointer(sg_id); - this->subgraphNumVertices = subgraphPointer->size(); + int sg_id = num_subg_remain; + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); // galois::gPrint("Subgraph num_vertices: ", subgraphNumVertices, ", // num_edges: ", subgraphPointer->sizeEdges(), "\n"); @@ -254,27 +257,31 @@ class Net { distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); } // update labels for subgraph - distContext->constructSubgraphLabels(this->subgraphNumVertices, - &subgraphs_masks[sg_id * globalSamples]); - layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); + distContext->constructSubgraphLabels( + this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + 
layers[num_layers - 1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); // update features for subgraph - distContext->constructSubgraphFeatures(this->subgraphNumVertices, - &subgraphs_masks[sg_id * globalSamples]); - layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data - - //Graph* testing = distContext->getSubgraphPointer(sg_id); - //for (size_t i = 0; i < testing->size(); i++) { - // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) { + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data + + // Graph* testing = distContext->getSubgraphPointer(sg_id); + // for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) + // { // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); // } //} } // end subgraph sample loop -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// // training steps galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, seperator); @@ -417,9 +424,9 @@ class Net { } } } else { - globalTestCount = - distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, - globalTestEnd, test_masks, dGraph); + globalTestCount = distContext->read_masks( + dataset, std::string("test"), globalSamples, globalTestBegin, + globalTestEnd, test_masks, dGraph); } #ifdef __GALOIS_HET_CUDA__ copy_test_masks_to_device(); @@ -431,7 +438,7 @@ class Net { // append conv layers std::cout << "\nConstructing layers...\n"; for (size_t i = 0; i < num_conv_layers - 1; i++) { - append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(i, true); // conv layers, act=true } append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false @@ -444,7 +451,7 @@ class Net { append_dense_layer(num_layers - 2); // dense layer } - append_out_layer(num_layers - 1); // output layer + append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { @@ -460,7 +467,7 @@ class Net { layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - //context->norm_factor_computing(false); + // context->norm_factor_computing(false); distContext->constructNormFactor(graphTopologyContext); for (size_t i = 0; i < num_conv_layers; i++) layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 1a9fabd9ec..72d5425817 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -20,8 +20,8 @@ class Sampler { //! average degree cut off to a clip int subg_deg; - //VertexList vertices_; - //mask_t* masks_; + // VertexList vertices_; + // mask_t* masks_; //! 
List of training nodes; sampling set std::vector trainingNodes; @@ -43,12 +43,13 @@ class Sampler { // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; - //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); - + // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", + // ne, "\n"); + // note this constructs the full graph's nodes; just trims edges sub.allocateFrom(n, ne); sub.constructNodes(); - + galois::do_all( galois::iterate((size_t)0, n), [&](const auto src) { @@ -58,44 +59,42 @@ class Sampler { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) { - //galois::gPrint(src, " ", dst, "\n"); + // galois::gPrint(src, " ", dst, "\n"); sub.constructEdge(idx++, dst, 0); } } } - } - , + }, galois::loopname("gen_subgraph")); } - -//! determine degree of each vertex in a masked graph (given by masks and g) -template -void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, - std::vector& degrees) { - assert(degrees.size() == n); + //! determine degree of each vertex in a masked graph (given by masks and g) + template + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + assert(degrees.size() == n); #ifdef PARALLEL_GEN - galois::do_all( - galois::iterate(size_t(0), n), - [&](const auto src) { + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { #else - for (size_t src = 0; src < n; src++) { + for (size_t src = 0; src < n; src++) { #endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - //galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } } } } - } #ifdef PARALLEL_GEN - , - galois::loopname("update_degrees")); + , + galois::loopname("update_degrees")); #endif -} + } //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); @@ -117,9 +116,9 @@ void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, //! API function for user-defined selection strategy // TODO how to expose this? 
virtual void selectVertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set); + VertexList vertices, VertexSet& vertex_set); virtual void selectVertices(size_t n, int m, VertexSet& vertex_set, - unsigned seed); + unsigned seed); // galois::runtime::iterable > // neighbor_sampler(Graph &g, VertexID v); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index ea98e26007..1df20fb96b 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,10 +6,11 @@ namespace deepgalois { DistContext::~DistContext() {} // TODO move to reader class -size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { - DGraph* dGraph = DistContext::partitionedGraph; +size_t DistContext::read_labels(bool isSingleClassLabel, + std::string dataset_str) { + DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); std::string filename = path + dataset_str + "-labels.txt"; @@ -24,11 +25,14 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str // size of labels should be # local nodes if (isSingleClassLabel) { galois::gPrint("[", myID, "] One hot labels...\n"); - this->h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for - // each vertex: N x 1 + this->h_labels = + new label_t[dGraph->size()]; // single-class (one-hot) label for + // each vertex: N x 1 } else { galois::gPrint("[", myID, "] Multi-class labels...\n"); - this->h_labels = new label_t[dGraph->size() * this->num_classes]; // multi-class label for each vertex: N x E + this->h_labels = new label_t[dGraph->size() * + this->num_classes]; // multi-class label for + // each vertex: N x E } uint32_t foundVertices = 0; @@ -75,7 +79,7 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str // TODO move to reader class size_t DistContext::read_features(std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading features from disk...\n"); std::string filename = path + dataset_str + ".ft"; @@ -106,7 +110,7 @@ size_t DistContext::read_features(std::string dataset_str) { size_t count = 0; for (size_t i = 0; i < m; i++) { if (dGraph->isLocal(i)) { - //h_feats[count * feat_len] = fullFeats[i]; + // h_feats[count * feat_len] = fullFeats[i]; std::copy(fullFeats + i * DistContext::feat_len, fullFeats + (i + 1) * DistContext::feat_len, &this->h_feats[count * DistContext::feat_len]); @@ -171,7 +175,8 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } void DistContext::initializeSyncSubstrate() { DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( - *DistContext::partitionedGraph, galois::runtime::getSystemNetworkInterface().ID, + *DistContext::partitionedGraph, + galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); } @@ -193,7 +198,6 @@ void DistContext::allocNormFactorSub(int subID) { // TODO clean out? 
} - void DistContext::constructNormFactor(deepgalois::Context* globalContext) { galois::gPrint("Norm factor construction\n"); // using original graph to get ids @@ -201,7 +205,7 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { allocNormFactor(); // this is for testing purposes - //galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + // galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), // [&] (unsigned i) { // this->normFactors[i] = 0; // } @@ -210,54 +214,61 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { #ifdef USE_MKL galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), [&] (unsigned i) { - float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = + std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - for (auto e = partitionedGraph->edge_begin(i); e != partitionedGraph->edge_end(i); e++) { - const auto j = partitionedGraph->getEdgeDst(e); - float_t c_j = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); - if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[e] = 0.0; - } else { - this->normFactors[e] = 1.0 / (c_i * c_j); - } + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[e] = 0.0; + } else { + this->normFactors[e] = 1.0 / (c_i * c_j); + } }, galois::loopname("NormCountingEdge")); ); #else - galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), - [&] (unsigned v) { - auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) { - this->normFactors[v] = 0.0; - } else { - this->normFactors[v] = 1.0 / temp; - } - }, - galois::loopname("NormCountingNode")); + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned v) { + auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); #endif galois::gPrint("Norm factor construction done\n"); } void DistContext::constructNormFactorSub(int subgraphID) { - galois::gPrint("Sub norm factor construction\n"); - // right now norm factor based on subgraph - // TODO fix this for dist execution + galois::gPrint("Sub norm factor construction\n"); + // right now norm factor based on subgraph + // TODO fix this for dist execution - allocNormFactorSub(subgraphID); + allocNormFactorSub(subgraphID); - Graph& graphToUse = *partitionedSubgraphs[subgraphID]; - graphToUse.degree_counting(); + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); - // TODO using partitioned subgraph rather than whoel graph; i.e. dist setting wrong + // TODO using partitioned subgraph rather than whoel graph; i.e. 
dist + // setting wrong #ifdef USE_MKL galois::do_all(galois::iterate((size_t)0, graphToUse->size()), [&] (unsigned i) { - //float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + // float_t c_i = + // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); - for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); e++) { + for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); + e++) { const auto j = graphToUse->getEdgeDst(e); float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); @@ -266,86 +277,90 @@ void DistContext::constructNormFactorSub(int subgraphID) { } else { this->normFactorsSub[e] = 1.0 / (c_i * c_j); } - }, + }, galois::loopname("NormCountingEdge")); ); #else - galois::do_all(galois::iterate((size_t)0, graphToUse.size()), - [&] (unsigned v) { - //auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); - auto degree = graphToUse.get_degree(v); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) { - this->normFactorsSub[v] = 0.0; - } else { - this->normFactorsSub[v] = 1.0 / temp; - } - //galois::gPrint(this->normFactorsSub[v], "\n"); - }, - galois::loopname("NormCountingNode")); + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned v) { + // auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + auto degree = graphToUse.get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactorsSub[v] = 0.0; + } else { + this->normFactorsSub[v] = 1.0 / temp; + } + // galois::gPrint(this->normFactorsSub[v], "\n"); + }, + galois::loopname("NormCountingNode")); #endif galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { - if (DistContext::usingSingleClass) { - DistContext::h_labels_subg.resize(m); - } else { - DistContext::h_labels_subg.resize(m * DistContext::num_classes); - } - - size_t count = 0; - // see which labels to copy over for this subgraph - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { if (DistContext::usingSingleClass) { - DistContext::h_labels_subg[count] = h_labels[i]; + DistContext::h_labels_subg.resize(m); } else { - std::copy(DistContext::h_labels + i * DistContext::num_classes, - DistContext::h_labels + (i + 1) * DistContext::num_classes, - &DistContext::h_labels_subg[count * DistContext::num_classes]); + DistContext::h_labels_subg.resize(m * DistContext::num_classes); } - //galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); - count++; - } - } - GALOIS_ASSERT(count == m); + + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg[count] = h_labels[i]; + } else { + std::copy( + DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); + } + // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], + // "\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } //! generate input features for the subgraph, m is subgraph size, //! 
masks tells which vertices to use void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { - size_t count = 0; - // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - DistContext::h_feats_subg.resize(m * feat_len); - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { - std::copy(DistContext::h_feats + i * DistContext::feat_len, - DistContext::h_feats + (i + 1) * DistContext::feat_len, - &DistContext::h_feats_subg[count * DistContext::feat_len]); - //for (unsigned a = 0; a < DistContext::feat_len; a++) { - // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { - // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], " "); - // } - //} - //galois::gPrint("\n"); - count++; - } - } - GALOIS_ASSERT(count == m); + size_t count = 0; + // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + // for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], + // " "); + // } + //} + // galois::gPrint("\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { - return DistContext::syncSubstrate; + return DistContext::syncSubstrate; }; void DistContext::allocateSubgraphs(int num_subgraphs) { - partitionedSubgraphs.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) { - partitionedSubgraphs[i] = new Graph(); - } + partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + partitionedSubgraphs[i] = new Graph(); + } } } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index ce23b2b51d..fbb6323891 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -9,7 +9,8 @@ namespace deepgalois { -void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { +void Net::partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel) { this->dGraph = graph; this->distContext = new deepgalois::DistContext(); this->distContext->saveDistGraph(dGraph); @@ -48,25 +49,25 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleCla } } } else { - globalTrainCount = this->distContext->read_masks(dataset_str, - "train", this->distNumSamples, globalTrainBegin, globalTrainEnd, - this->distTrainMasks, this->dGraph); - globalValCount = this->distContext->read_masks(dataset_str, - "val", this->distNumSamples, globalValBegin, globalValEnd, + globalTrainCount = this->distContext->read_masks( + dataset_str, "train", this->distNumSamples, globalTrainBegin, + globalTrainEnd, this->distTrainMasks, this->dGraph); + globalValCount = this->distContext->read_masks( + dataset_str, "val", this->distNumSamples, globalValBegin, globalValEnd, this->distValMasks, this->dGraph); } // input feature dimension: D feature_dims[0] = this->distContext->read_features(dataset_str); for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = this->h1; // hidden1 level embedding: 16 + feature_dims[i] = this->h1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = 
num_classes; // output embedding: E if (this->has_l2norm) { // l2 normalized embedding: E feature_dims[num_conv_layers + 1] = num_classes; } if (this->has_dense) { - // MLP embedding: E + // MLP embedding: E feature_dims[num_layers - 1] = num_classes; } @@ -127,16 +128,18 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, uint32_t localID = this->dGraph->getLID(i); if (masks == NULL) { - //GALOIS_DIE("subgraphs not implemented for dist yet"); + // GALOIS_DIE("subgraphs not implemented for dist yet"); // subgraph here: TODO - auto pred = math::argmax(num_classes, &preds[localID * num_classes]); + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); // check prediction if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; } else { if (masks[localID] == 1) { // get prediction - auto pred = math::argmax(num_classes, &preds[localID * num_classes]); + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); // check prediction if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 466cee6584..966caaedf3 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -26,7 +26,8 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, + DGraph* dg) { this->count_ = count; // save original graph Sampler::globalGraph = g; @@ -60,7 +61,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) { - //galois::gPrint(src, " ", dst, "\n"); + // galois::gPrint(src, " ", dst, "\n"); Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); } } @@ -69,7 +70,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap galois::loopname("gen_subgraph")); Sampler::globalMaskedGraph->degree_counting(); - Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? 
SAMPLE_CLIP : avg_deg; // TODO masked part graph as well to save time later; right now constructing @@ -96,8 +97,6 @@ void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, DB2.resize(size); } - - // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { @@ -295,7 +294,8 @@ void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); - for (auto v : vertices) masks[v] = 1; + for (auto v : vertices) + masks[v] = 1; } inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { @@ -362,21 +362,23 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } -void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed) { +void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, + unsigned seed) { VertexSet sampledSet; // n = 9000 by default - // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, sampledSet); - // do the sampling of vertices from training set + using masked graph + // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, + // sampledSet); do the sampling of vertices from training set + using masked + // graph this->selectVertices(n, m_, sampledSet, seed); // m = 1000 by default // sampledSet is a list of *global* ids in the graph // create new vertex set with LIDs for partitioned graph VertexSet sampledLIDs = this->convertToLID(sampledSet); - //VertexSet sampledLIDs; - //galois::gPrint("part graph num edges is ", partGraph->sizeEdges(), "\n"); - //galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), "\n"); - //for (auto i : this->trainingNodes) { + // VertexSet sampledLIDs; + // galois::gPrint("part graph num edges is ", partGraph->sizeEdges(), "\n"); + // galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), + // "\n"); for (auto i : this->trainingNodes) { // sampledLIDs.insert(i); //} @@ -386,11 +388,12 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed) // this graph will contain sampled vertices and induced subgraph for it Graph maskedSG; // TODO use partMaskedGraph once constructed later - this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + this->getMaskedGraph( + Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); // remove edges whose destination is not masked this->reindexSubgraph(sampledLIDs, maskedSG, sg); - //galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); + // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); } } // namespace deepgalois diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 4e07ca96cf..ce9d709dbf 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -5,7 +5,7 @@ // TODO template arg void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { -// std::cout << "[update_all] graph size: " << n << "\n"; + // std::cout << "[update_all] graph size: " << n << "\n"; size_t n = g.size(); galois::do_all( galois::iterate(size_t(0), n), diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 7acf787bae..5881b617cc 100644 --- 
a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -119,13 +119,14 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO sync of out_data required here // TODO how to do this for the sampled case? - //deepgalois::_syncVectorSize = z; - //deepgalois::_dataToSync = out_data; - //layer::context->getSyncSubstrate()->sync( + // deepgalois::_syncVectorSize = z; + // deepgalois::_dataToSync = out_data; + // layer::context->getSyncSubstrate()->sync( // "AggSync"); // run relu activation on output if specified - if (act_) math::relu_cpu(x * z, out_data, out_data); + if (act_) + math::relu_cpu(x * z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -164,15 +165,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } // sync agg - //deepgalois::_syncVectorSize = z; - //deepgalois::_dataToSync = out_temp; - //layer::context->getSyncSubstrate()->sync( + // deepgalois::_syncVectorSize = z; + // deepgalois::_dataToSync = out_temp; + // layer::context->getSyncSubstrate()->sync( // "AggSyncBack"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); - //layer::syncSub->sync("GradientSync"); + // layer::syncSub->sync("GradientSync"); // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); } diff --git a/libdeepgalois/src/sampler.cu b/libdeepgalois/src/sampler.cu index cecfa6c9e0..6fb452db4c 100644 --- a/libdeepgalois/src/sampler.cu +++ b/libdeepgalois/src/sampler.cu @@ -11,29 +11,34 @@ __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { } // compute the degrees of a masked graph -// n is the size of the original graph -__global__ void get_masked_degrees(index_t n, mask_t *masks, GraphGPU g, index_t* degrees) { +// n is the size of the original graph +__global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, + index_t* degrees) { CUDA_KERNEL_LOOP(src, n) { if (masks[src] == 1) { for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { auto dst = g.getEdgeDst(e); - if (masks[dst] == 1) degrees[src] ++; + if (masks[dst] == 1) + degrees[src]++; } } } } -// Given a graph, remove any edge which has end-point masked, and generate the subgraph -// n is the size of the original graph and the subgraph -// offset was computed by using prefix-sum of the masked degrees -__global__ void generate_masked_graph_kernel(index_t n, const mask_t *masks, const index_t* offsets, GraphGPU g, GraphGPU subg) { +// Given a graph, remove any edge which has end-point masked, and generate the +// subgraph n is the size of the original graph and the subgraph offset was +// computed by using prefix-sum of the masked degrees +__global__ void generate_masked_graph_kernel(index_t n, const mask_t* masks, + const index_t* offsets, GraphGPU g, + GraphGPU subg) { CUDA_KERNEL_LOOP(src, n) { - subg.fixEndEdge(src, offsets[src+1]); + subg.fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { auto dst = g.getEdgeDst(e); - if (masks[dst] == 1) subg.constructEdge(idx++, dst); + if (masks[dst] == 1) + subg.constructEdge(idx++, dst); } } } @@ -41,20 +46,25 @@ __global__ void generate_masked_graph_kernel(index_t n, const mask_t *masks, con // compute the degrees of the subgraph induced by the vertex set // n is the size of the vertex set -// new_ids array maps vertex ID in the original graph to the vertex ID in the subgraph 
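// The sampling kernels in this file share one pattern: count the degrees that
// survive (the mask or the sampled vertex set), prefix-sum them into row offsets,
// then gather the remaining edges into the compacted CSR subgraph.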
-__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, GraphGPU g, index_t* degrees) { +// new_ids array maps vertex ID in the original graph to the vertex ID in the +// subgraph +__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, + GraphGPU g, index_t* degrees) { CUDA_KERNEL_LOOP(i, n) { - auto v = vertices[i]; + auto v = vertices[i]; degrees[new_ids[v]] = g.getOutDegree(v); } } -// Given a masked graph, remove the masked vertices, reindex the rest vertices, and generate the subgraph -// offset was computed by using prefix-sum of the new degrees -// n is the size of the old_ids and the sbugraph -__global__ void generate_graph_kernel(index_t n, const index_t* offsets, const index_t* old_ids, const index_t* new_ids, GraphGPU g, GraphGPU subg) { +// Given a masked graph, remove the masked vertices, reindex the rest vertices, +// and generate the subgraph offset was computed by using prefix-sum of the new +// degrees n is the size of the old_ids and the sbugraph +__global__ void generate_graph_kernel(index_t n, const index_t* offsets, + const index_t* old_ids, + const index_t* new_ids, GraphGPU g, + GraphGPU subg) { CUDA_KERNEL_LOOP(i, n) { - subg.fixEndEdge(i, offsets[i+1]); + subg.fixEndEdge(i, offsets[i + 1]); index_t j = 0; auto src = old_ids[i]; for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { @@ -66,15 +76,15 @@ __global__ void generate_graph_kernel(index_t n, const index_t* offsets, const i } } -void Sampler::update_masks(size_t n, index_t* vertices, mask_t *masks) { +void Sampler::update_masks(size_t n, index_t* vertices, mask_t* masks) { set_masks<<>>(n, vertices, masks); } -void Sampler::indexing(size_t n, index_t* vertices, index_t *new_indices) { +void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { index_t vid = 0; for (index_t i = 0; i < n; i++) { - auto v = vertices[i]; - new_indices[v] = vid ++; + auto v = vertices[i]; + new_indices[v] = vid++; } } @@ -87,7 +97,8 @@ inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { return new_ids; } -void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU *g, GraphGPU *subg) { +void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, + GraphGPU* subg) { index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n); get_masked_degrees<<>>(n, masks, g, degrees); @@ -102,29 +113,35 @@ void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU *g, Graph } // use a random walk to select vertex subset -void Sampler::select_vertices(size_t n, int m, VertexSet &st) { -} +void Sampler::select_vertices(size_t n, int m, VertexSet& st) {} // n: size of the original graph // nv: size of the subgraph; i.e. 
size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) -void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, GraphGPU *g, GraphGPU *sub) { +void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, + GraphGPU* g, GraphGPU* sub) { // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); - index_t *d_vertex_list; - cudaMalloc((void **) &d_vertex_list, nv*sizeof(index_t)); - CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv*sizeof(index_t), cudaMemcpyHostToDevice)); + index_t* d_vertex_list; + cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); + CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), + cudaMemcpyHostToDevice)); index_t n = graph->size(); - update_masks(n, d_vertex_list, masks); // set masks for vertices in the vertex_set - GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - generate_masked_graph(n, masks, g, &masked_sg); // remove edges whose destination is not masked + update_masks(n, d_vertex_list, + masks); // set masks for vertices in the vertex_set + GraphGPU + masked_sg; // size is the same as original graph, but masked dst removed + generate_masked_graph( + n, masks, g, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph - index_t *d_new_ids; // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) - cudaMalloc((void **) &d_new_ids, n*sizeof(index_t)); + index_t* d_new_ids; // Given an old vertex ID โˆˆ [0, n), returns a new vertex + // ID โˆˆ [0, nv) + cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); auto new_ids = reindexing_vertices(nv, vertex_set); - CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n*sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), + cudaMemcpyHostToDevice)); // generate the offsets for the re-indexed subgraph index_t *degrees, *offsets; @@ -142,4 +159,4 @@ void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, sub); } -} +} // namespace deepgalois From 57dacca83622e189c39a58a9869293fe9644e5b6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 11 May 2020 21:47:21 -0500 Subject: [PATCH 287/660] fix some errors --- libdeepgalois/CMakeLists.txt | 16 ++--- libdeepgalois/include/deepgalois/GraphTypes.h | 12 +++- libdeepgalois/include/deepgalois/Net.h | 69 ++++++++++++------- .../include/deepgalois/layers/layer.h | 18 +++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 - libdeepgalois/src/layers/leaky_relu_layer.cpp | 2 - libdeepgalois/src/layers/relu_layer.cpp | 2 - .../src/layers/sigmoid_loss_layer.cpp | 2 - .../src/layers/softmax_loss_layer.cpp | 2 - libgpu/include/graph_gpu.h | 3 +- lonestar/gnn/gcn/gcn.cpp | 53 +------------- lonestar/gnn/include/engine.h | 18 +++-- 12 files changed, 85 insertions(+), 114 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 2f05527318..d591c4927f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -25,10 +25,9 @@ include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -if(NOT ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") -else() +if(ENABLE_HETERO_GALOIS) # hetero path + 
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D__GALOIS_HET_CUDA__") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers @@ -71,7 +70,10 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if(NOT ENABLE_HETERO_GALOIS) +if(ENABLE_HETERO_GALOIS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") + set(sources src/reader.cpp) +else() set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -91,11 +93,7 @@ if(NOT ENABLE_HETERO_GALOIS) src/node.cpp src/Net.cpp ) -else() - # dummy sources set for dg_cpu for HETERO build - # TODO fix this - set(sources src/reader.cpp) -endif(NOT ENABLE_HETERO_GALOIS) +endif(ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 3a93565724..c542f42b89 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -1,16 +1,22 @@ #pragma once #include "deepgalois/types.h" -#include "galois/Galois.h" -#include "galois/graphs/NewGeneric.h" #include "deepgalois/lgraph.h" #ifdef __GALOIS_HET_CUDA__ -// TODO reintroduce GPU as necessary here +#include "graph_gpu.h" +#else +#include "galois/Galois.h" +#include "galois/graphs/NewGeneric.h" #endif namespace deepgalois { using edge_iterator = index_t; +#ifdef __GALOIS_HET_CUDA__ +using Graph = CSRGraph; +using GraphGPU = CSRGraph; +#else using DGraph = galois::graphs::DistGraph; using Graph = LearningGraph; +#endif } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 04f51f317b..58433c7c1c 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -10,11 +10,13 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" #include "deepgalois/utils.h" -#include "deepgalois/Sampler.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" +#ifndef __GALOIS_HET_CUDA__ +#include "deepgalois/Sampler.h" #include "deepgalois/DistContext.h" +#endif namespace deepgalois { @@ -23,7 +25,11 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; +#ifdef __GALOIS_HET_CUDA__ + unsigned myID = 0; +#else + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; +#endif std::string header = "[" + std::to_string(myID) + "] "; std::string seperator = "\n"; @@ -77,36 +83,38 @@ class Net { //! context holds all of the graph data deepgalois::Context* graphTopologyContext; + +#ifndef __GALOIS_HET_CUDA__ //! 
dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; - DGraph* dGraph; - Sampler* sampler; +#endif public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, - bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, - int val_itv) + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, int val_itv) : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host +#ifndef __GALOIS_HET_CUDA__ this->myID = galois::runtime::getSystemNetworkInterface().ID; +#endif this->header = "[" + std::to_string(myID) + "] "; - this->seperator = "\n"; + this->seperator = " "; assert(n_conv > 0); // TODO use galois print - galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + std::cout << header << "Configuration: num_threads " << num_threads + << ", num_conv_layers " << num_conv_layers << ", num_epochs " + << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " + << learning_rate << ", dropout_rate " << dropout_rate + << ", weight_decay " << weight_decay << "\n"; this->num_layers = num_conv_layers + 1; // additional layers to add @@ -152,6 +160,7 @@ class Net { "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } +#ifndef __GALOIS_HET_CUDA__ // make sure sampel size isn't greater than what we have to train with if (subgraph_sample_size > globalTrainCount) { GALOIS_DIE("subgraph size can not be larger than the size of training " @@ -162,6 +171,7 @@ class Net { // used for sampling) this->sampler = new Sampler(); +#endif } //! Default net constructor @@ -178,10 +188,12 @@ class Net { // test_masks(NULL), context(NULL) {} void init(); + +#ifndef __GALOIS_HET_CUDA__ //! 
Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); - +#endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -194,14 +206,15 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; - galois::gPrint(header, - "Constructing training vertex set induced graph...\n"); + std::cout << header << "Constructing training vertex set induced graph...\n"; +#ifndef __GALOIS_HET_CUDA__ sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, graphTopologyContext->getGraphPointer(), distContext->getGraphPointer()); +#endif } - galois::gPrint(header, "Start training...\n"); + std::cout << header << "Start training...\n"; Timer t_epoch; @@ -214,8 +227,7 @@ class Net { //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint(header, "Generating ", num_subgraphs, - " subgraph(s)\n"); + std::cout << header << "Generating " << num_subgraphs << " subgraph(s)\n"; // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); @@ -284,7 +296,7 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, seperator); + std::cout << header << "Epoch " << std::setw(3) << curEpoch << seperator; set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -304,8 +316,8 @@ class Net { // validation / testing set_netphases(net_phase::test); - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, seperator); + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << seperator; t_epoch.Stop(); @@ -330,8 +342,8 @@ class Net { double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + std::cout << header << "Average training time per epoch: " << avg_train_time + << " ms. Throughput: " << throughput << " epoch/s\n"; } // evaluate, i.e. 
inference or predict @@ -419,14 +431,21 @@ class Net { globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; for (size_t i = globalTestBegin; i < globalTestEnd; i++) { - if (dGraph->isLocal(i)) { +#ifndef __GALOIS_HET_CUDA__ + if (dGraph->isLocal(i)) test_masks[dGraph->getLID(i)] = 1; - } +#else + // TODO: Read for GPU +#endif } } else { globalTestCount = distContext->read_masks( dataset, std::string("test"), globalSamples, globalTestBegin, +#ifdef __GALOIS_HET_CUDA__ + globalTestEnd, test_masks, NULL); +#else globalTestEnd, test_masks, dGraph); +#endif } #ifdef __GALOIS_HET_CUDA__ copy_test_masks_to_device(); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 3a33d54440..45b7bcc8bd 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -11,14 +11,15 @@ #include #include "deepgalois/GraphTypes.h" #include "deepgalois/Context.h" - -#include "deepgalois/DistContext.h" #include "deepgalois/optimizer.h" #include "deepgalois/layers/node.h" +#ifndef __GALOIS_HET_CUDA__ +#include "deepgalois/DistContext.h" #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" +#endif namespace deepgalois { @@ -37,7 +38,11 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: +#ifdef __GALOIS_HET_CUDA__ + using ContextType = deepgalois::Context; +#else using ContextType = deepgalois::DistContext; +#endif layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -173,15 +178,14 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; // TODO -#ifndef __GALOIS_HET_CUDA__ - Graph* graph_cpu; -#else +#ifdef __GALOIS_HET_CUDA__ GraphGPU* graph_gpu; -#endif - +#else + Graph* graph_cpu; // Used for synchronization of weight gradients deepgalois::GluonGradients* gradientGraph; galois::graphs::GluonSubstrate* syncSub; +#endif }; //! Connects tail to head's edge and sets that edge's target to tail diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 5881b617cc..d7c29d1cfa 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,7 +10,6 @@ float_t* _dataToSync = nullptr; //! sync long unsigned _syncVectorSize = 0; -#ifndef __GALOIS_HET_CUDA__ inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); @@ -181,5 +180,4 @@ acc_t graph_conv_layer::get_weight_decay_loss() { return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); } -#endif // end if CPU_ONLY } // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index dd4357739f..a230de1090 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -12,7 +12,6 @@ leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, name_ = layer_type() + "_" + std::to_string(level); } -#ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? 
๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -25,6 +24,5 @@ void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, float_t* out_grad, float_t* in_grad) { math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 03cd0f4652..0576bea642 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -3,7 +3,6 @@ namespace deepgalois { -#ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -18,6 +17,5 @@ void relu_layer::back_propagation(const float_t*, const float_t* out_data, size_t n = input_dims[0] * input_dims[1]; math::d_relu_cpu(n, out_grad, out_data, in_grad); } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index d20f2a769b..3dcb312f08 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -4,7 +4,6 @@ namespace deepgalois { -#ifdef CPU_ONLY sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -93,6 +92,5 @@ acc_t sigmoid_loss_layer::get_prediction_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index f3eb3ee969..940fbeb798 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -4,7 +4,6 @@ namespace deepgalois { -#ifdef CPU_ONLY softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -98,6 +97,5 @@ acc_t softmax_loss_layer::get_prediction_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } -#endif } // namespace deepgalois diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 449e38a7b5..4c480bd8fa 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -159,6 +159,7 @@ struct CSRGraph { assert(src <= nnodes); return row_start[src+1]; }; + CUDA_HOSTDEV index_type *row_start_host_ptr() { return row_start; } CUDA_HOSTDEV index_type *row_start_ptr() { return row_start; } CUDA_HOSTDEV const index_type *row_start_ptr() const { return row_start; } CUDA_HOSTDEV index_type *edge_dst_ptr() { return edge_dst; } @@ -172,7 +173,7 @@ struct CSRGraph { assert(dst < nnodes); assert(eid < nedges); edge_dst[eid] = dst; - //if (edge_data) edge_data[eid] = edata; + if (edge_data) edge_data[eid] = edata; } void malloc_index_device(index_type n, index_type *ptr); void set_index(index_type pos, index_type value, index_type *ptr); diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index fabd27667f..c33e7d5574 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -1,61 +1,10 @@ // Graph Neural Networks // Xuhao Chen #include "lonestargnn.h" -#ifdef GALOIS_USE_DIST -#include "DistributedGraphLoader.h" -#endif const char* name = "Graph Convolutional Networks"; const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; -int main(int argc, char** argv) { - 
galois::DistMemSys G; - LonestarGnnStart(argc, argv, name, desc, url); +#include "engine.h" - // Get a partitioned graph first - std::vector dummyVec; - deepgalois::DGraph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); - - // initialize network + whole context on CPU - // read network, features, ground truth, initialize metadata - // default setting for now; can be customized by the user - deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, - learning_rate, dropout_rate, weight_decay, - add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, val_interval); - - // initialize distributed context - network.partitionInit(dGraph, dataset, is_single_class); - - // construct layers from distributed context - network.construct_layers(); - network.print_layers_info(); - deepgalois::ResourceManager rm; // tracks peak memory usage - - // the optimizer used to update parameters, - // see optimizer.h for more details - // optimizer *opt = new gradient_descent(); - // optimizer *opt = new adagrad(); - deepgalois::optimizer* opt = new deepgalois::adam(); - galois::StatTimer Ttrain("TrainAndVal"); - Ttrain.start(); - network.train(opt, do_validate); // do training using training samples - Ttrain.stop(); - - if (do_test) { - // test using test samples - galois::gPrint("\n"); - network.read_test_masks(dataset); - galois::StatTimer Ttest("Test"); - Ttest.start(); - acc_t test_loss = 0.0, test_acc = 0.0; - double test_time = network.evaluate("test", test_loss, test_acc); - galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, - " test_time = ", test_time, "\n"); - Ttest.stop(); - } - galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); - return 0; -} diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 7d0691de0f..4820d5c7fc 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -6,19 +6,23 @@ int main(int argc, char** argv) { galois::DistMemSys G; LonestarGnnStart(argc, argv, name, desc, url); - // the neural network to train: loads the entire graph on CPU + // Get a partitioned graph first + std::vector dummyVec; + deepgalois::DGraph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); + + // initialize network + whole context on CPU + // read network, features, ground truth, initialize metadata + // default setting for now; can be customized by the user deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); - std::vector dummyVec; - deepgalois::Graph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph, dataset); + // initialize distributed context + network.partitionInit(dGraph, dataset, is_single_class); - // read network, features, ground truth, initialize metadata - // default setting for now; can be customized by the user + // construct layers from distributed context network.construct_layers(); network.print_layers_info(); deepgalois::ResourceManager rm; // tracks peak memory usage From e37a4f6b7e98d6ef07742e0aaffc2ea15e0ad515 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 23:03:16 -0500 Subject: [PATCH 288/660] fix mkl build --- libdeepgalois/src/DistContext.cpp | 35 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/libdeepgalois/src/DistContext.cpp 
b/libdeepgalois/src/DistContext.cpp index 1df20fb96b..a9c604befc 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -214,23 +214,23 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { #ifdef USE_MKL galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), [&] (unsigned i) { - float_t c_i = - std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = + std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - for (auto e = partitionedGraph->edge_begin(i); - e != partitionedGraph->edge_end(i); e++) { - const auto j = partitionedGraph->getEdgeDst(e); - float_t c_j = std::sqrt( - float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); - if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[e] = 0.0; - } else { - this->normFactors[e] = 1.0 / (c_i * c_j); + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[*e] = 0.0; + } else { + this->normFactors[*e] = 1.0 / (c_i * c_j); + } } }, galois::loopname("NormCountingEdge")); - ); #else galois::do_all( galois::iterate((size_t)0, partitionedGraph->size()), @@ -261,15 +261,15 @@ void DistContext::constructNormFactorSub(int subgraphID) { // TODO using partitioned subgraph rather than whoel graph; i.e. dist // setting wrong #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, graphToUse->size()), + galois::do_all(galois::iterate((size_t)0, graphToUse.size()), [&] (unsigned i) { // float_t c_i = // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); - for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); + for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); e++) { - const auto j = graphToUse->getEdgeDst(e); + const auto j = graphToUse.getEdgeDst(e); float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); if (c_i == 0.0 || c_j == 0.0) { @@ -277,9 +277,8 @@ void DistContext::constructNormFactorSub(int subgraphID) { } else { this->normFactorsSub[e] = 1.0 / (c_i * c_j); } - }, - galois::loopname("NormCountingEdge")); - ); + } + }, galois::loopname("NormCountingEdge")); #else galois::do_all( galois::iterate((size_t)0, graphToUse.size()), From fd25e3e71bb5332a75a3077e118f6144ed1af877 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 09:41:34 -0500 Subject: [PATCH 289/660] fix more --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/Context.h | 12 ++--- .../include/deepgalois/DistContext.h | 45 +++++-------------- libdeepgalois/include/deepgalois/GraphTypes.h | 1 + libdeepgalois/include/deepgalois/Net.h | 11 ++--- .../include/deepgalois/layers/layer.h | 6 +-- libdeepgalois/src/DistContext.cpp | 23 ++++++++++ libdeepgalois/src/{net.cu => Net.cu} | 11 +++-- 8 files changed, 51 insertions(+), 60 deletions(-) rename libdeepgalois/src/{net.cu => Net.cu} (95%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index d591c4927f..aa4850c8c4 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -60,7 +60,7 @@ if(ENABLE_HETERO_GALOIS) src/context.cu src/lgraph.cu src/node.cu - src/net.cu + src/Net.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas 
-lcusparse -lcurand) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 341270201a..0be03a1972 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -39,8 +39,7 @@ class Context { static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t - curand_generator_; // used to generate random numbers on GPU + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU GraphGPU graph_gpu; // the input graph, |V| = N std::vector subgraphs_gpu; @@ -50,14 +49,11 @@ class Context { float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } label_t* get_labels_subg_ptr() { return d_labels_subg; } + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { - return cusparse_matdescr_; - } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } + inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif Context(); diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 14b2ae18b7..9a3496a9c9 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -3,7 +3,9 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ +#ifndef __GALOIS_HET_CUDA__ #include "galois/graphs/GluonSubstrate.h" +#endif #include "deepgalois/types.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" @@ -13,20 +15,18 @@ namespace deepgalois { class DistContext { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - galois::graphs::GluonSubstrate* syncSubstrate; - Graph* lGraph; // laerning graph version +#ifndef __GALOIS_HET_CUDA__ + galois::graphs::GluonSubstrate* syncSubstrate; +#endif DGraph* partitionedGraph; // the input graph, |V| = N std::vector partitionedSubgraphs; label_t* h_labels; // labels for classification. 
Single-class label: Nx1, // multi-class label: NxE - std::vector h_labels_subg; // labels for subgraph - float_t* h_feats; // input features: N x D - std::vector h_feats_subg; // input features for subgraph - - // change regular one to a vector as well - std::vector - normFactors; // normalization constant based on graph structure + float_t* h_feats; // input features: N x D + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph bool usingSingleClass; @@ -35,29 +35,6 @@ class DistContext { DistContext() : usingSingleClass(true){}; ~DistContext(); - void saveDistGraph(DGraph* a) { - partitionedGraph = a; - - // construct lgraph from underlying lc csr graph - // TODO fix this so i don't have more than 1 copy of graph in memory - this->lGraph = new Graph(); - this->lGraph->allocateFrom(a->size(), a->sizeEdges()); - this->lGraph->constructNodes(); - - galois::do_all( - galois::iterate((size_t)0, a->size()), - [&](const auto src) { - this->lGraph->fixEndEdge(src, *a->edge_end(src)); - index_t idx = *(a->edge_begin(src)); - - for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { - const auto dst = a->getEdgeDst(e); - this->lGraph->constructEdge(idx++, dst, 0); - } - }, - galois::loopname("lgraphcopy")); - } - //! read labels of local nodes only size_t read_labels(bool isSingleClassLabel, std::string dataset_str); //! read features of local nodes only @@ -68,7 +45,6 @@ class DistContext { DGraph* getGraphPointer() { return partitionedGraph; } Graph* getLGraphPointer() { return lGraph; } - Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } @@ -76,7 +52,10 @@ class DistContext { label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); +#ifndef __GALOIS_HET_CUDA__ + void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); +#endif //! allocate the norm factor vector void allocNormFactor(); diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index c542f42b89..6829194e26 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -13,6 +13,7 @@ namespace deepgalois { using edge_iterator = index_t; #ifdef __GALOIS_HET_CUDA__ +using DGraph = CSRGraph; using Graph = CSRGraph; using GraphGPU = CSRGraph; #else diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 58433c7c1c..53ffd54960 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -12,10 +12,10 @@ #include "deepgalois/utils.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" +#include "deepgalois/DistContext.h" #ifndef __GALOIS_HET_CUDA__ #include "deepgalois/Sampler.h" -#include "deepgalois/DistContext.h" #endif namespace deepgalois { @@ -84,10 +84,11 @@ class Net { //! context holds all of the graph data deepgalois::Context* graphTopologyContext; -#ifndef __GALOIS_HET_CUDA__ //! dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; DGraph* dGraph; + +#ifndef __GALOIS_HET_CUDA__ Sampler* sampler; #endif @@ -189,14 +190,10 @@ class Net { void init(); -#ifndef __GALOIS_HET_CUDA__ //! 
Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str, - bool isSingleClassLabel); -#endif + void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } - void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 45b7bcc8bd..91b57c7041 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -13,9 +13,9 @@ #include "deepgalois/Context.h" #include "deepgalois/optimizer.h" #include "deepgalois/layers/node.h" +#include "deepgalois/DistContext.h" #ifndef __GALOIS_HET_CUDA__ -#include "deepgalois/DistContext.h" #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" @@ -38,11 +38,7 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: -#ifdef __GALOIS_HET_CUDA__ - using ContextType = deepgalois::Context; -#else using ContextType = deepgalois::DistContext; -#endif layer(unsigned level, std::vector in_dims, std::vector out_dims) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index a9c604befc..528ba700cc 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -5,6 +5,29 @@ namespace deepgalois { DistContext::~DistContext() {} +void DistContext::saveDistGraph(DGraph* a) { + partitionedGraph = a; + + // construct lgraph from underlying lc csr graph + // TODO fix this so i don't have more than 1 copy of graph in memory + this->lGraph = new Graph(); + this->lGraph->allocateFrom(a->size(), a->sizeEdges()); + this->lGraph->constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, a->size()), + [&](const auto src) { + this->lGraph->fixEndEdge(src, *a->edge_end(src)); + index_t idx = *(a->edge_begin(src)); + + for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { + const auto dst = a->getEdgeDst(e); + this->lGraph->constructEdge(idx++, dst, 0); + } + }, + galois::loopname("lgraphcopy")); +} + // TODO move to reader class size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/Net.cu similarity index 95% rename from libdeepgalois/src/net.cu rename to libdeepgalois/src/Net.cu index f1bbe97c94..647e8e0738 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/Net.cu @@ -148,14 +148,13 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { void Net::init() { - copy_masks_device(num_samples, train_masks, d_train_masks); - copy_masks_device(num_samples, val_masks, d_val_masks); - context - ->copy_data_to_device(); // copy labels and input features to the device + copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); + copy_masks_device(globalSamples, globalValMasks, d_val_masks); + distContext->copy_data_to_device(); // copy labels and input features to the device } void Net::copy_test_masks_to_device() { - copy_masks_device(num_samples, test_masks, d_test_masks); + copy_masks_device(globalSamples, test_masks, d_test_masks); } // add weight decay @@ -166,7 +165,7 @@ void Net::regularize() { layers[layer_id]->get_grads_device_ptr()); } -void 
Net::normalize() {} +//void Net::normalize() {} acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, From 5acfdf9a62d95c812ee66273714c4448fc1b328b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 11:01:57 -0500 Subject: [PATCH 290/660] fix DistContext --- libdeepgalois/CMakeLists.txt | 4 +- .../include/deepgalois/DistContext.h | 36 ++++++-- libdeepgalois/include/deepgalois/GraphTypes.h | 3 + libdeepgalois/include/deepgalois/Net.h | 4 + libdeepgalois/include/deepgalois/reader.h | 4 +- libdeepgalois/src/DistContext.cpp | 1 + .../src/{context.cu => DistContext.cu} | 92 ++++++++----------- libdeepgalois/src/math_functions.cu | 2 +- libdeepgalois/src/reader.cpp | 2 +- lonestar/gnn/include/engine.h | 54 ++++++++++- lonestar/gnn/include/lonestargnn.h | 70 ++------------ 11 files changed, 143 insertions(+), 129 deletions(-) rename libdeepgalois/src/{context.cu => DistContext.cu} (67%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index aa4850c8c4..7548664a9d 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -27,7 +27,7 @@ link_directories(${CMAKE_SOURCE_DIR}/libgalois) if(ENABLE_HETERO_GALOIS) # hetero path - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D__GALOIS_HET_CUDA__") + set(CUDA_NVCC_FLAGS "-D__GALOIS_HET_CUDA__ ${CUDA_NVCC_FLAGS}") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers @@ -57,7 +57,7 @@ if(ENABLE_HETERO_GALOIS) src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu - src/context.cu + src/DistContext.cu src/lgraph.cu src/node.cu src/Net.cu diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 9a3496a9c9..300bd216cc 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -13,10 +13,18 @@ namespace deepgalois { class DistContext { - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - Graph* lGraph; // laerning graph version -#ifndef __GALOIS_HET_CUDA__ + bool is_device; // is this on device or host + bool is_selfloop_added; // whether selfloop is added to the input graph + bool usingSingleClass; + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + Graph* lGraph; // laerning graph version +#ifdef __GALOIS_HET_CUDA__ + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device +#else galois::graphs::GluonSubstrate* syncSubstrate; #endif DGraph* partitionedGraph; // the input graph, |V| = N @@ -28,17 +36,21 @@ class DistContext { std::vector h_feats_subg; // input features for subgraph std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph - bool usingSingleClass; public: // TODO better constructor - DistContext() : usingSingleClass(true){}; + DistContext(); + DistContext(bool isDevice) : is_device(isDevice) {} ~DistContext(); + size_t read_graph(std::string dataset_str, bool selfloop = false); + //! read labels of local nodes only size_t read_labels(bool isSingleClassLabel, std::string dataset_str); + //! read features of local nodes only size_t read_features(std::string dataset_str); + //! 
read masks of local nodes only size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); @@ -52,7 +64,17 @@ class DistContext { label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); -#ifndef __GALOIS_HET_CUDA__ +#ifdef __GALOIS_HET_CUDA__ + void copy_data_to_device(); // copy labels and input features + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } +#else void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); #endif diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 6829194e26..4e39a820f9 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -4,7 +4,10 @@ #include "deepgalois/lgraph.h" #ifdef __GALOIS_HET_CUDA__ +#define USE_CSRGRAPH +#ifdef USE_CSRGRAPH #include "graph_gpu.h" +#endif #else #include "galois/Galois.h" #include "galois/graphs/NewGeneric.h" diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 53ffd54960..2264db6690 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -131,6 +131,10 @@ class Net { graphTopologyContext->set_dataset(dataset_str); // read *entire* graph, get num nodes globalSamples = graphTopologyContext->read_graph(selfloop); +#ifdef __GALOIS_HET_CUDA__ + this->distContext = new deepgalois::DistContext(); + this->distNumSamples = this->distContext->read_graph(dataset_str, selfloop); +#endif // get training and validation sets: this is to create the training // subgraph in the sampler diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 1bcda0b4b7..55890d79ae 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,5 +1,5 @@ #pragma once -#include "deepgalois/GraphTypes.h" +#include "deepgalois/lgraph.h" namespace deepgalois { @@ -16,7 +16,7 @@ class Reader { size_t read_features(float_t*& feats, std::string filetype = "bin"); size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); - void readGraphFromGRFile(Graph* g); + void readGraphFromGRFile(LearningGraph* g); }; } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 528ba700cc..2d6cb5de85 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,6 +3,7 @@ #include "deepgalois/configs.h" namespace deepgalois { +DistContext::DistContext() : usingSingleClass(true) {} DistContext::~DistContext() {} void DistContext::saveDistGraph(DGraph* a) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/DistContext.cu similarity index 67% rename from libdeepgalois/src/context.cu rename to libdeepgalois/src/DistContext.cu index 05a1b0cd8f..26c5c56d90 100644 --- 
a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/DistContext.cu @@ -2,7 +2,7 @@ #include #include #include -#include "deepgalois/context.h" +#include "deepgalois/DistContext.h" #include "deepgalois/math_functions.hh" #include "deepgalois/configs.h" @@ -27,26 +27,21 @@ int64_t cluster_seedgen(void) { namespace deepgalois { // computing normalization factor for each vertex -__global__ void norm_factor_computing_node(int n, GraphGPU graph, - float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) - norm_fac[i] = 0.0; - else - norm_fac[i] = 1.0 / temp; + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; } } // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_computing_edge(int n, GraphGPU graph, - float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != - 0.0); // should never be zero since self-loop added for each vertex + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex d_src = 1.0 / sqrt(d_src); auto start = graph.edge_begin(src); index_t end = graph.edge_end(src); @@ -63,12 +58,12 @@ __global__ void norm_factor_computing_edge(int n, GraphGPU graph, } } -cublasHandle_t Context::cublas_handle_ = 0; -cusparseHandle_t Context::cusparse_handle_ = 0; -cusparseMatDescr_t Context::cusparse_matdescr_ = 0; -curandGenerator_t Context::curand_generator_ = 0; +cublasHandle_t DistContext::cublas_handle_ = 0; +cusparseHandle_t DistContext::cusparse_handle_ = 0; +cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; +curandGenerator_t DistContext::curand_generator_ = 0; -Context::Context() : Context(true) { +DistContext::DistContext() : DistContext(true) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -82,7 +77,7 @@ Context::Context() : Context(true) { curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } -Context::~Context() { +DistContext::~DistContext() { if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); if (cusparse_handle_) @@ -95,38 +90,37 @@ Context::~Context() { CUDA_CHECK(cudaFree(d_labels)); if (d_feats) CUDA_CHECK(cudaFree(d_feats)); - if (norm_factors) - CUDA_CHECK(cudaFree(norm_factors)); } -void Context::allocateSubgraphs(int n_sg) {} +void DistContext::allocateSubgraphs(int n_sg) {} -void Context::gen_subgraph_labels(size_t m, const mask_t* masks) {} +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) {} -void Context::gen_subgraph_feats(size_t m, const mask_t* masks) {} +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) {} -void Context::norm_factor_computing(bool is_subgraph, int subg_id) { +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + auto n = partitionedGraph->size(); std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; exit(0); } #ifdef USE_CUSPARSE - int nnz = graph_gpu.sizeEdges(); - CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); - init_const_gpu(nnz, 0.0, norm_factors); + int nnz = partitionedGraph->sizeEdges(); + CUDA_CHECK(cudaMalloc((void**)&normFactors[0], nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, &normFactors[0]); norm_factor_computing_edge<<>>( - n, graph_gpu, norm_factors); + n, *partitionedGraph, &normFactors[0]); #else - CUDA_CHECK(cudaMalloc((void**)&norm_factors, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&(&normFactors[0]), n * sizeof(float_t))); norm_factor_computing_node<<>>( - n, graph_gpu, norm_factors); + n, *partitionedGraph, &normFactors[0]); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; } /* -void Context::SetDevice(const int device_id) { +void DistContext::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) return; @@ -141,7 +135,8 @@ CURAND_RNG_PSEUDO_DEFAULT)); cluster_seedgen())); } */ -size_t Context::read_graph(bool selfloop) { +size_t DistContext::read_graph(std::string dataset, bool selfloop) { + partitionedGraph = new DGraph(); #ifdef USE_CSRGRAPH std::string filename = path + dataset + ".csgr"; GraphGPU g; @@ -150,41 +145,30 @@ size_t Context::read_graph(bool selfloop) { g.add_selfloop(); is_selfloop_added = selfloop; } - g.copy_to_gpu(graph_gpu); + g.copy_to_gpu(*partitionedGraph); #else - graph_gpu.readGraph(dataset); + partitionedGraph->readGraph(dataset); if (selfloop) { - graph_gpu.add_selfloop(); + partitionedGraph->add_selfloop(); is_selfloop_added = selfloop; } - graph_gpu.copy_to_gpu(); + partitionedGraph->copy_to_gpu(); #endif - n = graph_gpu.size(); - return n; + return partitionedGraph->size(); } -void Context::copy_data_to_device() { - if (is_single_class) { +void DistContext::copy_data_to_device() { + auto n = partitionedGraph->size(); + if (usingSingleClass) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); } else { - CUDA_CHECK( - cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); // print_device_vector(10, d_feats, "d_feats"); } -// void Context::copy_data_to_device() { -// float_malloc_device(n, d_labels); -// float_copy_device(n, h_labels, d_labels); -// float_malloc_device(n*feat_len, d_feats); -// float_copy_device(n*feat_len, &h_feats[0], d_feats); -//} - } // namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 8b5ab8100f..80e4f6d394 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,5 @@ #include "deepgalois/math_functions.hh" 
-#include "deepgalois/context.h" +#include "deepgalois/DistContext.h" #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 29f729f3a4..c8de34e448 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -166,7 +166,7 @@ void Reader::progressPrint(unsigned max, unsigned i) { } } -void Reader::readGraphFromGRFile(Graph* g) { +void Reader::readGraphFromGRFile(LearningGraph* g) { std::string filename = path + dataset_str + ".csgr"; std::ifstream ifs; ifs.open(filename); diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 4820d5c7fc..21be590817 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,15 +1,65 @@ #ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" +#include "galois/DistGalois.h" +#include "galois/runtime/Network.h" #endif +#include "deepgalois/Net.h" + +//! initialize lonestargnn benchmark +void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, + const char* url) { + llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + galois::runtime::setStatFile(statFile); + +#ifndef __GALOIS_HET_CUDA__ + numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU +#endif + +#ifdef GALOIS_USE_DIST + auto& net = galois::runtime::getSystemNetworkInterface(); + if (net.ID == 0) { +#endif + LonestarGnnPrintVersion(llvm::outs()); + std::cout << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? app : "unspecified") << "\n"; + if (desc) + std::cout << desc << "\n"; + if (url) + std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" + << url << "\n"; + std::cout << "\n"; + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); +#ifdef GALOIS_USE_DIST + } +#endif + + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("(NULL)", "Hostname", name); +} int main(int argc, char** argv) { +#ifdef GALOIS_USE_DIST galois::DistMemSys G; +#endif LonestarGnnStart(argc, argv, name, desc, url); // Get a partitioned graph first std::vector dummyVec; - deepgalois::DGraph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); + deepgalois::DGraph* dGraph = NULL; +#ifdef GALOIS_USE_DIST + dGraph = galois::graphs::constructSymmetricGraph(dummyVec); +#endif // initialize network + whole context on CPU // read network, features, ground truth, initialize metadata diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index 21e73cb024..639396c6b5 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -5,15 +5,12 @@ #include "galois/Timer.h" #include "galois/Galois.h" #include "galois/Version.h" -#include "galois/Reduction.h" -#include "galois/ParallelSTL.h" -#include "galois/runtime/Profile.h" +//#include "galois/Reduction.h" +//#include "galois/ParallelSTL.h" +//#include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" #include -#include "galois/DistGalois.h" -#include "galois/runtime/Network.h" - namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); 
// 'cora', 'citeseer', 'pubmed' @@ -50,62 +47,15 @@ extern llvm::cl::opt numThreads; extern llvm::cl::opt statFile; //! standard global options to the benchmarks -llvm::cl::opt - skipVerify("noverify", - llvm::cl::desc("Skip verification step (default value false)"), - llvm::cl::init(false)); -llvm::cl::opt - numThreads("t", llvm::cl::desc("Number of threads (default value 1)"), - llvm::cl::init(1)); -llvm::cl::opt statFile( - "statFile", - llvm::cl::desc("ouput file to print stats to (default value empty)"), - llvm::cl::init("")); +llvm::cl::opt skipVerify("noverify", + llvm::cl::desc("Skip verification step (default value false)"), llvm::cl::init(false)); +llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default value 1)"), llvm::cl::init(1)); +llvm::cl::opt statFile("statFile", + llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { - out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" - << galois::getRevision() << ")\n"; + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() + << " (" << galois::getRevision() << ")\n"; out.flush(); } -//! initialize lonestargnn benchmark -void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, - const char* url) { - llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); - llvm::cl::ParseCommandLineOptions(argc, argv); - numThreads = galois::setActiveThreads(numThreads); - galois::runtime::setStatFile(statFile); - -#ifdef GALOIS_USE_DIST - auto& net = galois::runtime::getSystemNetworkInterface(); - if (net.ID == 0) { -#endif - LonestarGnnPrintVersion(llvm::outs()); - std::cout << "Copyright (C) " << galois::getCopyrightYear() - << " The University of Texas at Austin\n"; - std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; - std::cout << "application: " << (app ? 
app : "unspecified") << "\n"; - if (desc) - std::cout << desc << "\n"; - if (url) - std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" - << url << "\n"; - std::cout << "\n"; - std::ostringstream cmdout; - for (int i = 0; i < argc; ++i) { - cmdout << argv[i]; - if (i != argc - 1) - cmdout << " "; - } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); - galois::runtime::reportParam("(NULL)", "Threads", numThreads); -#ifdef GALOIS_USE_DIST - } -#endif - - char name[256]; - gethostname(name, 256); - galois::runtime::reportParam("(NULL)", "Hostname", name); -} - -#include "deepgalois/Net.h" From 9d824604b31f1e1f583024717cf120c91f0a9990 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 13:20:28 -0500 Subject: [PATCH 291/660] remove Context.cpp --- libdeepgalois/CMakeLists.txt | 1 - libdeepgalois/include/deepgalois/Context.h | 82 +++++------------ .../include/deepgalois/DistContext.h | 29 ++++-- libdeepgalois/include/deepgalois/GraphTypes.h | 1 + libdeepgalois/include/deepgalois/Net.h | 1 + libdeepgalois/include/deepgalois/lgraph.h | 2 +- libdeepgalois/src/Context.cpp | 88 ------------------- libdeepgalois/src/DistContext.cu | 8 ++ libdeepgalois/src/lgraph.cpp | 10 +-- libdeepgalois/src/reader.cpp | 32 +++++++ 10 files changed, 92 insertions(+), 162 deletions(-) delete mode 100644 libdeepgalois/src/Context.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7548664a9d..064f24d0d7 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -84,7 +84,6 @@ else() src/layers/aggregator.cpp src/math_functions.cpp src/optimizer.cpp - src/Context.cpp src/DistContext.cpp src/Sampler.cpp src/reader.cpp diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 0be03a1972..6200540847 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -1,85 +1,49 @@ #pragma once -/** - * Based on common.hpp file of the Caffe deep learning library. - */ - #include #include #include "deepgalois/types.h" #include "deepgalois/reader.h" +#include "deepgalois/configs.h" #include "deepgalois/GraphTypes.h" -#ifdef __GALOIS_HET_CUDA__ -#include "deepgalois/cutils.h" -#endif - namespace deepgalois { class Context { - std::string dataset; bool is_device; // is this on device or host bool is_selfloop_added; // whether selfloop is added to the input graph - - label_t* d_labels; // labels on device - label_t* d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - + std::string dataset; Reader reader; public: -// TODO separate below to public and private -#ifndef __GALOIS_HET_CUDA__ - Graph* graph_cpu; // the input graph, |V| = N - std::vector subgraphs_cpu; - void add_selfloop(Graph& og, Graph& g); - //! 
returns pointer to the graph - Graph* getGraphPointer() { return graph_cpu; } -#else - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE - static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU - - GraphGPU graph_gpu; // the input graph, |V| = N - std::vector subgraphs_gpu; - GraphGPU* getGraphPointer() { return &graph_gpu; } - GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; - float_t* get_feats_ptr() { return d_feats; } - float_t* get_feats_subg_ptr() { return d_feats_subg; } - label_t* get_labels_ptr() { return d_labels; } - label_t* get_labels_subg_ptr() { return d_labels_subg; } - - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; } -#endif - - Context(); + GraphCPU* graph_cpu; // the input graph, |V| = N + GraphCPU* getGraphPointer() { return graph_cpu; } + Context() : Context(false) {} //! initializer for gpu; goes ahead and sets a few things - Context(bool use_gpu) - : is_device(use_gpu), is_selfloop_added(false), d_labels(NULL), - d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL) {} - ~Context(); - - size_t read_graph(bool selfloop); - - size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, - mask_t* masks) { - return reader.read_masks(mask_type, n, begin, end, masks); - } - + Context(bool use_gpu) : is_device(use_gpu), is_selfloop_added(false) {} + ~Context() {} void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); } + size_t read_masks(std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); + } + size_t read_graph(bool selfloop) { + graph_cpu = new GraphCPU(); + graph_cpu->readGraph(dataset, selfloop); + is_selfloop_added = selfloop; + std::cout << "num_vertices " << graph_cpu->size() + << " num_edges " << graph_cpu->sizeEdges() << "\n"; + return graph_cpu->size(); + } //! Checks if subgraph being used, sets currenet graph, then calls degreex //! counting - Graph* getFullGraph(); - - void copy_data_to_device(); // copy labels and input features + GraphCPU* getFullGraph() { + graph_cpu->degree_counting(); // TODO: why is it here? should be in read_graph + return graph_cpu; + } }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 300bd216cc..745c298608 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -1,14 +1,15 @@ #ifndef __DG_DIST_CONTEXT__ #define __DG_DIST_CONTEXT__ -/** - * Based on common.hpp file of the Caffe deep learning library. 
- */ -#ifndef __GALOIS_HET_CUDA__ +#ifdef __GALOIS_HET_CUDA__ +#include "deepgalois/cutils.h" +#else #include "galois/graphs/GluonSubstrate.h" #endif + #include "deepgalois/types.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" +#include "deepgalois/reader.h" namespace deepgalois { @@ -16,6 +17,7 @@ class DistContext { bool is_device; // is this on device or host bool is_selfloop_added; // whether selfloop is added to the input graph bool usingSingleClass; + std::string dataset; size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D Graph* lGraph; // laerning graph version @@ -37,6 +39,8 @@ class DistContext { std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph + Reader reader; + public: // TODO better constructor DistContext(); @@ -58,13 +62,13 @@ class DistContext { DGraph* getGraphPointer() { return partitionedGraph; } Graph* getLGraphPointer() { return lGraph; } Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; - float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } - label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); #ifdef __GALOIS_HET_CUDA__ + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } void copy_data_to_device(); // copy labels and input features static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE @@ -77,8 +81,17 @@ class DistContext { #else void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } #endif + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } + //! 
allocate the norm factor vector void allocNormFactor(); void allocNormFactorSub(int subID); diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 4e39a820f9..2ee3f86b93 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -15,6 +15,7 @@ namespace deepgalois { using edge_iterator = index_t; +using GraphCPU = LearningGraph; #ifdef __GALOIS_HET_CUDA__ using DGraph = CSRGraph; using Graph = CSRGraph; diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 2264db6690..f6e8516c5e 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -133,6 +133,7 @@ class Net { globalSamples = graphTopologyContext->read_graph(selfloop); #ifdef __GALOIS_HET_CUDA__ this->distContext = new deepgalois::DistContext(); + this->distContext->set_dataset(dataset_str); this->distNumSamples = this->distContext->read_graph(dataset_str, selfloop); #endif diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 40ca6c5a18..e9a185bfac 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -56,7 +56,7 @@ class LearningGraph { void degree_counting(); void constructNodes() {} - void readGraph(std::string dataset); + void readGraph(std::string dataset, bool selfloop = false); void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } void allocateFrom(index_t nv, index_t ne) { // printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp deleted file mode 100644 index c9bbe9e706..0000000000 --- a/libdeepgalois/src/Context.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Based on common.hpp file of the Caffe deep learning library. 
- */ -#include "deepgalois/Context.h" -#include "deepgalois/utils.h" -#include "deepgalois/configs.h" -#include "galois/Galois.h" - -namespace deepgalois { - -Context::Context() : Context(false) {} - -Context::~Context() {} - -size_t Context::read_graph(bool selfloop) { - std::string filename = path + dataset + ".csgr"; - std::string filetype = "gr"; - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - if (filetype == "bin") { - graph_cpu->readGraph(dataset); - } else if (filetype == "gr") { - graph_cpu = new Graph(); - std::string filename = path + dataset + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - if (selfloop) { - galois::gWarn("SELF LOOPS NOT SUPPORTED AT THIS TIME"); - Graph graph_temp; - // galois::graphs::readGraph(graph_temp, filename); - graph_temp.readGraph(dataset); - add_selfloop(graph_temp, *graph_cpu); - is_selfloop_added = selfloop; - //} else galois::graphs::readGraph(*graph_cpu, filename); - } else { - graph_cpu->readGraph(dataset); - galois::gPrint("graph read size ", graph_cpu->size()); - } - // TODO dist version of self loop - } else { - GALOIS_DIE("unknown file format for readgraph"); - } - Tread.stop(); - - auto g = getGraphPointer(); - galois::gPrint("num_vertices ", g->size(), " num_edges ", g->sizeEdges(), - "\n"); - return g->size(); -} - -void Context::add_selfloop(Graph& og, Graph& g) { - // TODO not actually implemented yet - g.allocateFrom(og.size(), og.size() + og.sizeEdges()); - g.constructNodes(); - // for (size_t src = 0; src < og.size(); src++) { - // //g.getData(src) = 1; - // auto begin = og.edge_begin(src); - // auto end = og.edge_end(src); - // g.fixEndEdge(src, end+src+1); - // bool self_inserted = false; - // if (begin == end) { - // new_edge_dst[begin+i] = i; - // continue; - // } - // for (auto e = begin; e != end; e++) { - // auto dst = og.getEdgeDst(e); - // if (!self_inserted) { - // if (dst > src) { - // g.constructEdge(e+src, src, 0); - // g.constructEdge(e+src+1, dst, 0); - // self_inserted = true; - // } else if (e+1 == end) { - // g.constructEdge(e+src+1, src, 0); - // g.constructEdge(e+src, dst, 0); - // self_inserted = true; - // } else g.constructEdge(e+src, dst, 0); - // } else g.constructEdge(e+src+1, dst, 0); - // } - //} -} - -// get current graph, also gets degrees of g -Graph* Context::getFullGraph() { - Graph* g = getGraphPointer(); - g->degree_counting(); - return g; -} - -} // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 26c5c56d90..c43d0020f0 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -92,6 +92,10 @@ DistContext::~DistContext() { CUDA_CHECK(cudaFree(d_feats)); } +size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); +} + void DistContext::allocateSubgraphs(int n_sg) {} void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) {} @@ -119,6 +123,10 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; } + +void DistContext::constructNormFactorSub(int subgraphID) { +} + /* void DistContext::SetDevice(const int device_id) { int current_device; diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 572f4e5662..c0c39b4023 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -20,7 +20,8 @@ uint64_t 
LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } -void LearningGraph::readGraph(std::string dataset) { +void LearningGraph::readGraph(std::string dataset, bool selfloop) { + if (selfloop) std::cout << "selfloop not yet implemented\n"; deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } @@ -28,10 +29,9 @@ void LearningGraph::readGraph(std::string dataset) { void LearningGraph::degree_counting() { // if (degrees_ != NULL) return; // degrees_ = new index_t[num_vertices_]; - galois::do_all( - galois::iterate(size_t(0), size_t(num_vertices_)), - [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, - galois::loopname("DegreeCounting")); + galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); } void LearningGraph::dealloc() {} diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index c8de34e448..961b852ded 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -255,4 +255,36 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { << masterLength / 1000.0 / runtime << " MB/s)\n\n"; } +/* +void add_selfloop(Graph& og, Graph& g) { + g.allocateFrom(og.size(), og.size() + og.sizeEdges()); + g.constructNodes(); + for (size_t src = 0; src < og.size(); src++) { + //g.getData(src) = 1; + auto begin = og.edge_begin(src); + auto end = og.edge_end(src); + g.fixEndEdge(src, end+src+1); + bool self_inserted = false; + if (begin == end) { + new_edge_dst[begin+i] = i; + continue; + } + for (auto e = begin; e != end; e++) { + auto dst = og.getEdgeDst(e); + if (!self_inserted) { + if (dst > src) { + g.constructEdge(e+src, src, 0); + g.constructEdge(e+src+1, dst, 0); + self_inserted = true; + } else if (e+1 == end) { + g.constructEdge(e+src+1, src, 0); + g.constructEdge(e+src, dst, 0); + self_inserted = true; + } else g.constructEdge(e+src, dst, 0); + } else g.constructEdge(e+src+1, dst, 0); + } + } +} +//*/ + } // namespace deepgalois From 7d6cc3bef777c8395d6f8a01927a3816db1c296b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 14:16:40 -0500 Subject: [PATCH 292/660] fix gpu compilation --- libdeepgalois/include/deepgalois/Net.h | 21 +++++---------- libdeepgalois/src/DistContext.cpp | 13 ++++----- libdeepgalois/src/DistContext.cu | 11 +++++++- libdeepgalois/src/Net.cpp | 7 +---- libdeepgalois/src/Net.cu | 27 +++++++++++++++++++ libdeepgalois/src/lgraph.cu | 2 +- libdeepgalois/src/math_functions.cu | 37 +++++++++++++------------- lonestar/gnn/include/engine.h | 12 +++++++++ lonestar/gnn/include/lonestargnn.h | 15 ----------- 9 files changed, 81 insertions(+), 64 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index f6e8516c5e..405e4c5cb2 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -110,7 +110,7 @@ class Net { assert(n_conv > 0); - // TODO use galois print + // TODO use galois print: need avoid including Galois.h for GPU std::cout << header << "Configuration: num_threads " << num_threads << ", num_conv_layers " << num_conv_layers << ", num_epochs " << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " @@ -131,11 +131,6 @@ class Net { graphTopologyContext->set_dataset(dataset_str); // read *entire* graph, get num nodes globalSamples = graphTopologyContext->read_graph(selfloop); -#ifdef __GALOIS_HET_CUDA__ - this->distContext = new 
deepgalois::DistContext(); - this->distContext->set_dataset(dataset_str); - this->distNumSamples = this->distContext->read_graph(dataset_str, selfloop); -#endif // get training and validation sets: this is to create the training // subgraph in the sampler @@ -166,18 +161,16 @@ class Net { "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } -#ifndef __GALOIS_HET_CUDA__ // make sure sampel size isn't greater than what we have to train with - if (subgraph_sample_size > globalTrainCount) { - GALOIS_DIE("subgraph size can not be larger than the size of training " - "set\n"); - } + assert(subgraph_sample_size <= globalTrainCount); + + layers.resize(num_layers); + // hidden1 level embedding: 16 + for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = this->h1; // features are read in distcontext, not this context (this context only // used for sampling) - - this->sampler = new Sampler(); -#endif + init(); } //! Default net constructor diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 2d6cb5de85..e3c5efb038 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -30,8 +30,7 @@ void DistContext::saveDistGraph(DGraph* a) { } // TODO move to reader class -size_t DistContext::read_labels(bool isSingleClassLabel, - std::string dataset_str) { +size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -49,14 +48,12 @@ size_t DistContext::read_labels(bool isSingleClassLabel, // size of labels should be # local nodes if (isSingleClassLabel) { galois::gPrint("[", myID, "] One hot labels...\n"); - this->h_labels = - new label_t[dGraph->size()]; // single-class (one-hot) label for - // each vertex: N x 1 + // single-class (one-hot) label for each vertex: N x 1 + this->h_labels = new label_t[dGraph->size()]; } else { galois::gPrint("[", myID, "] Multi-class labels...\n"); - this->h_labels = new label_t[dGraph->size() * - this->num_classes]; // multi-class label for - // each vertex: N x E + this->h_labels = new label_t[dGraph->size() * this->num_classes]; + // multi-class label for each vertex: N x E } uint32_t foundVertices = 0; diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index c43d0020f0..0c7f89d2db 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -92,7 +92,16 @@ DistContext::~DistContext() { CUDA_CHECK(cudaFree(d_feats)); } -size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { +size_t DistContext::read_labels(bool isSingleClass, std::string dataset_str) { + return reader.read_labels(isSingleClass, h_labels); +} + +size_t DistContext::read_features(std::string dataset_str) { + return reader.read_features(h_feats); +} + +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { return reader.read_masks(mask_type, n, begin, end, masks); } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index fbb6323891..809642dfd8 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -59,8 +59,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, // input feature dimension: D feature_dims[0] = this->distContext->read_features(dataset_str); - for (size_t i = 1; i < num_conv_layers; 
i++) - feature_dims[i] = this->h1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E if (this->has_l2norm) { // l2 normalized embedding: E @@ -70,12 +69,9 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, // MLP embedding: E feature_dims[num_layers - 1] = num_classes; } - feature_dims[num_layers] = num_classes; // normalized output embedding: E - layers.resize(num_layers); } -#ifndef __GALOIS_HET_CUDA__ void Net::init() { if (subgraph_sample_size) sampler = new deepgalois::Sampler(); @@ -164,6 +160,5 @@ acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 647e8e0738..b170afa35d 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -150,7 +150,34 @@ namespace deepgalois { void Net::init() { copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); copy_masks_device(globalSamples, globalValMasks, d_val_masks); +} + +void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { + this->distContext = new deepgalois::DistContext(); + this->distContext->set_dataset(dataset_str); + + // read the graph into CPU memory and copy it to GPU memory + this->distNumSamples = this->distContext->read_graph(dataset_str, is_selfloop); + + // read labels into CPU memory + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); + + // read features into CPU memory + feature_dims[0] = this->distContext->read_features(dataset_str); + + // copy labels and features from CPU memory to GPU memory distContext->copy_data_to_device(); // copy labels and input features to the device + + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E } void Net::copy_test_masks_to_device() { diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 679a4b6d8a..9e1f2ab29e 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -5,7 +5,7 @@ namespace deepgalois { -void LearningGraph::readGraph(std::string dataset) { +void LearningGraph::readGraph(std::string dataset, bool selfloop) { deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 80e4f6d394..1ea5662d91 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -33,12 +33,12 @@ bool isnan_gpu(int n, const float_t* array) { void gpu_rng_uniform(size_t n, float_t* r) { CURAND_CHECK( - curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); } void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK( - curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) scal_gpu(n, range, r); @@ -48,7 +48,7 @@ void rng_uniform_gpu(size_t n, const float_t a, const float_t b, 
float_t* r) { void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { - CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, + CURAND_CHECK(curandGenerateNormal(deepgalois::DistContext::curand_generator(), r, n, mu, sigma)); } @@ -203,7 +203,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, + CUBLAS_CHECK(cublasSgemm(deepgalois::DistContext::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } @@ -228,14 +228,14 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, // std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << // ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2( - deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, + deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, - deepgalois::Context::cusparse_matdescr(), A_nonzeros, A_idx_ptr, + deepgalois::DistContext::cusparse_matdescr(), A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); // transpose C const float one = 1.0; const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } @@ -253,21 +253,21 @@ A_nonzeros, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ORDER_COL)); cusparseDnMatDescr_t C_descr; CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, CUSPARSE_ORDER_COL)); size_t bufferSize; - CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::Context::cusparse_handle(), + CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, (void*)&alpha, A_descr, B_descr, (void*)&beta, C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); cudaDeviceSynchronize(); void* buffer = NULL; if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); - CUSPARSE_CHECK(cusparseSpMM(deepgalois::Context::cusparse_handle(), + CUSPARSE_CHECK(cusparseSpMM(deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, (const void*)&alpha, A_descr, B_descr, (const void*)&beta, C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); cudaDeviceSynchronize(); //transpose C const float one = 1.0; const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } //*/ @@ -276,29 +276,29 @@ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, + CUBLAS_CHECK(cublasSgemv(deepgalois::DistContext::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { CUBLAS_CHECK( - cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); + cublasSscal(deepgalois::DistContext::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { CUBLAS_CHECK( - cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); + cublasSdot(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(deepgalois::Context::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(deepgalois::DistContext::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float* x, float* y) { CUBLAS_CHECK( - cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); + cublasScopy(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK( - cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); + cublasSscal(deepgalois::DistContext::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { @@ -341,7 +341,7 @@ __global__ void axpy_kernel(const int n, const float_t a, const float_t* x, void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { // axpy_kernel<<>>(n, a, x, y); CUBLAS_CHECK( - cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); + cublasSaxpy(deepgalois::DistContext::cublas_handle(), n, &a, x, 1, y, 1)); CudaTest("solving axpy kernel failed"); } @@ -354,8 +354,7 @@ __global__ void l2_norm_kernel(const int n, const float_t* a, float_t* sum) { acc_t l2_norm_gpu(int n, const float_t* x) { float_t sum = 0.0; - CUBLAS_CHECK( - cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); + CUBLAS_CHECK(cublasSnrm2(deepgalois::DistContext::cublas_handle(), n, x, 1, &sum)); // float_t *d_sum; // CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 21be590817..727cd52f6b 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,10 +1,22 @@ +// Execution engine +#include +#include #ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" #include "galois/DistGalois.h" #include "galois/runtime/Network.h" #endif +#include "galois/Galois.h" +#include "galois/Version.h" +#include "galois/Timer.h" #include "deepgalois/Net.h" +static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() + << " (" << galois::getRevision() << ")\n"; + out.flush(); +} + //! 
initialize lonestargnn benchmark void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, const char* url) { diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index 639396c6b5..8b18e80ae0 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -1,15 +1,6 @@ #pragma once -#include -#include -#include "galois/Timer.h" -#include "galois/Galois.h" -#include "galois/Version.h" -//#include "galois/Reduction.h" -//#include "galois/ParallelSTL.h" -//#include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" -#include namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, @@ -53,9 +44,3 @@ llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default val llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); -static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { - out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() - << " (" << galois::getRevision() << ")\n"; - out.flush(); -} - From d053ce9d0c0bff60ba479f64da439720d425a6b8 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 15:39:51 -0500 Subject: [PATCH 293/660] gpu fixed --- .../include/deepgalois/DistContext.h | 11 +++-- libdeepgalois/include/deepgalois/Net.h | 42 ++++--------------- libdeepgalois/src/DistContext.cu | 20 +++++---- libdeepgalois/src/Net.cpp | 17 ++++++++ libdeepgalois/src/Net.cu | 15 +++++++ libdeepgalois/src/layers/aggregator.cu | 7 ++-- libdeepgalois/src/math_functions.cu | 15 +++---- lonestar/gnn/include/engine.h | 2 + 8 files changed, 69 insertions(+), 60 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 745c298608..ff28bb607c 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -20,12 +20,14 @@ class DistContext { std::string dataset; size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - Graph* lGraph; // laerning graph version + Graph* lGraph; // learning graph version #ifdef __GALOIS_HET_CUDA__ label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device + float_t* d_normFactors; + float_t* d_normFactorsSub; #else galois::graphs::GluonSubstrate* syncSubstrate; #endif @@ -69,6 +71,8 @@ class DistContext { float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } label_t* get_labels_subg_ptr() { return d_labels_subg; } + float_t* get_norm_factors_ptr() { return d_normFactors; } + float_t* get_norm_factors_subg_ptr() { return d_normFactorsSub; } void copy_data_to_device(); // copy labels and input features static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE @@ -85,6 +89,8 @@ class DistContext { float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } + float_t* get_norm_factors_ptr() { return normFactors.data(); } + float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } #endif void set_dataset(std::string dataset_str) { @@ -102,9 +108,6 @@ class DistContext { void constructSubgraphLabels(size_t m, const mask_t* masks); void 
constructSubgraphFeatures(size_t m, const mask_t* masks); - float_t* get_norm_factors_ptr() { return normFactors.data(); } - float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } - //! return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { return h_labels[i]; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 405e4c5cb2..f8e601d0fa 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -271,14 +271,12 @@ class Net { // update labels for subgraph distContext->constructSubgraphLabels( this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); - layers[num_layers - 1]->set_labels_ptr( - distContext->get_labels_subg_ptr()); + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); // update features for subgraph distContext->constructSubgraphFeatures( this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); - layers[0]->set_feats_ptr( - distContext->get_feats_subg_ptr()); // feed input data + layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data // Graph* testing = distContext->getSubgraphPointer(sg_id); // for (size_t i = 0; i < testing->size(); i++) { @@ -419,33 +417,7 @@ class Net { } // read masks of test set - void read_test_masks(std::string dataset) { - test_masks = new mask_t[distNumSamples]; - if (dataset == "reddit") { - globalTestBegin = 177262; - globalTestCount = 55703; - globalTestEnd = globalTestBegin + globalTestCount; - for (size_t i = globalTestBegin; i < globalTestEnd; i++) { -#ifndef __GALOIS_HET_CUDA__ - if (dGraph->isLocal(i)) - test_masks[dGraph->getLID(i)] = 1; -#else - // TODO: Read for GPU -#endif - } - } else { - globalTestCount = distContext->read_masks( - dataset, std::string("test"), globalSamples, globalTestBegin, -#ifdef __GALOIS_HET_CUDA__ - globalTestEnd, test_masks, NULL); -#else - globalTestEnd, test_masks, dGraph); -#endif - } -#ifdef __GALOIS_HET_CUDA__ - copy_test_masks_to_device(); -#endif - } + void read_test_masks(std::string dataset); void copy_test_masks_to_device(); void construct_layers() { @@ -454,17 +426,14 @@ class Net { for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } - append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false if (has_l2norm) { append_l2norm_layer(num_conv_layers); // l2_norm layer } - if (has_dense) { append_dense_layer(num_layers - 2); // dense layer } - append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients @@ -474,7 +443,6 @@ class Net { for (size_t i = 1; i < num_layers; i++) { connect(layers[i - 1], layers[i]); } - for (size_t i = 0; i < num_layers; i++) { layers[i]->malloc_and_init(); } @@ -537,7 +505,11 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#ifdef __GALOIS_HET_CUDA__ + layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); +#else layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); +#endif } // update trainable weights after back-propagation diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 0c7f89d2db..91d39bb9a4 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -93,11 +93,13 @@ DistContext::~DistContext() { } size_t DistContext::read_labels(bool isSingleClass, 
std::string dataset_str) { - return reader.read_labels(isSingleClass, h_labels); + num_classes = reader.read_labels(isSingleClass, h_labels); + return num_classes; } size_t DistContext::read_features(std::string dataset_str) { - return reader.read_features(h_feats); + feat_len = reader.read_features(h_feats); + return feat_len; } size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, @@ -119,15 +121,15 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { exit(0); } #ifdef USE_CUSPARSE - int nnz = partitionedGraph->sizeEdges(); - CUDA_CHECK(cudaMalloc((void**)&normFactors[0], nnz * sizeof(float_t))); - init_const_gpu(nnz, 0.0, &normFactors[0]); + auto nnz = partitionedGraph->sizeEdges(); + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, d_normFactors); norm_factor_computing_edge<<>>( - n, *partitionedGraph, &normFactors[0]); + n, *partitionedGraph, d_normFactors); #else - CUDA_CHECK(cudaMalloc((void**)&(&normFactors[0]), n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, n * sizeof(float_t))); norm_factor_computing_node<<>>( - n, *partitionedGraph, &normFactors[0]); + n, *partitionedGraph, d_normFactors); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; @@ -176,6 +178,7 @@ size_t DistContext::read_graph(std::string dataset, bool selfloop) { void DistContext::copy_data_to_device() { auto n = partitionedGraph->size(); + std::cout << "Copying labels and features to GPU memory. n = " << n << " ... "; if (usingSingleClass) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); @@ -186,6 +189,7 @@ void DistContext::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); // print_device_vector(10, d_feats, "d_feats"); + std::cout << "Done\n"; } } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 809642dfd8..c9d5f1e7fc 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -86,6 +86,23 @@ void Net::regularize() { layers[layer_id]->get_grads_ptr()); } +void Net::read_test_masks(std::string dataset) { + test_masks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + if (dGraph->isLocal(i)) + test_masks[dGraph->getLID(i)] = 1; + } + } else { + globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); + } + copy_test_masks_to_device(); +} + /** * * @param begin GLOBAL begin diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index b170afa35d..7ea47dc3ae 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -180,6 +180,21 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleCla feature_dims[num_layers] = num_classes; // normalized output embedding: E } +void Net::read_test_masks(std::string dataset) { + test_masks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < 
globalTestEnd; i++) + test_masks[i] = 1; + } else { + globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalSamples, globalTestBegin, globalTestEnd, test_masks, NULL); + } + copy_test_masks_to_device(); +} + void Net::copy_test_masks_to_device() { copy_masks_device(globalSamples, test_masks, d_test_masks); } diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 2bfe55ca46..b29e980da3 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -90,13 +90,12 @@ void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, float* temp; const int* row_start = (const int*)g.row_start_ptr(); const int* edge_dst = (const int*)g.edge_dst_ptr(); - // printf("row_start_ptr: 0x%x\n", row_start); - // printf("edge_dst_ptr: 0x%x\n", edge_dst); + //printf("row_start_ptr: 0x%x\n", row_start); + //printf("edge_dst_ptr: 0x%x\n", edge_dst); // print_device_int_vector(10, row_start, "row_start"); // print_device_int_vector(10, edge_dst, "edge_dst"); float_malloc_device(n * len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, - temp, out); + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); float_free_device(temp); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 1ea5662d91..246091903c 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -225,19 +225,16 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float* transpose_C, float* C) { - // std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << - // ", nnz=" << nnz << "\n"; - CUSPARSE_CHECK(cusparseScsrmm2( - deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, - deepgalois::DistContext::cusparse_matdescr(), A_nonzeros, A_idx_ptr, - A_nnz_idx, B, N, &beta, transpose_C, M)); + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), + A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); // transpose C const float one = 1.0; const float zero = 0.0; CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, - CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, - M, C, N)); + CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } /* void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 727cd52f6b..ad63ffdb78 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -63,6 +63,8 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, int main(int argc, char** argv) { #ifdef GALOIS_USE_DIST galois::DistMemSys G; +#else + galois::SharedMemSys G; #endif LonestarGnnStart(argc, argv, name, desc, url); From 84527772827a8b2d3f5bc055038b885c3bb68b52 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 15:47:22 -0500 Subject: [PATCH 294/660] small fix --- 
libdeepgalois/include/deepgalois/Net.h | 2 +- libdeepgalois/src/Net.cpp | 1 - libdeepgalois/src/Net.cu | 7 +++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index f8e601d0fa..5b45b03d11 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -418,7 +418,7 @@ class Net { // read masks of test set void read_test_masks(std::string dataset); - void copy_test_masks_to_device(); + //void copy_test_masks_to_device(); void construct_layers() { // append conv layers diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index c9d5f1e7fc..41ce7b2d77 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -100,7 +100,6 @@ void Net::read_test_masks(std::string dataset) { globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); } - copy_test_masks_to_device(); } /** diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 7ea47dc3ae..b63e5df3a6 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -192,13 +192,12 @@ void Net::read_test_masks(std::string dataset) { globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, NULL); } - copy_test_masks_to_device(); -} - -void Net::copy_test_masks_to_device() { + //copy_test_masks_to_device(); copy_masks_device(globalSamples, test_masks, d_test_masks); } +//void Net::copy_test_masks_to_device() {} + // add weight decay void Net::regularize() { size_t layer_id = 0; From df94c434ba18a4e6e24e84c2d9f3e1d38f29c230 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 17:04:44 -0500 Subject: [PATCH 295/660] fix sampler interface --- libdeepgalois/include/deepgalois/Net.h | 8 ++- libdeepgalois/include/deepgalois/Sampler.h | 44 ++++++-------- libdeepgalois/src/DistContext.cpp | 4 +- libdeepgalois/src/Sampler.cpp | 64 ++++++++------------ libdeepgalois/src/{sampler.cu => Sampler.cu} | 26 +++----- 5 files changed, 59 insertions(+), 87 deletions(-) rename libdeepgalois/src/{sampler.cu => Sampler.cu} (87%) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 5b45b03d11..e62f68b297 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -230,9 +230,11 @@ class Net { // generate subgraphs #ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { - sampler->sampleSubgraph( - subgraph_sample_size, *(distContext->getSubgraphPointer(sid)), - &subgraphs_masks[sid * globalSamples], curEpoch); + VertexSet sampledSet; + sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + &subgraphs_masks[sid * globalSamples], + distContext->getSubgraphPointer(sid)); } #endif num_subg_remain = num_subgraphs; diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 72d5425817..bdff17e6e2 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -12,7 +12,7 @@ class Sampler { typedef int db_t; protected: - int m_; + index_t m; // number of vertice in the frontier size_t count_; //! averaged degree of masked graph @@ -34,8 +34,7 @@ class Sampler { //! 
Reindex a graph to only contain those in the vertex set void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); - //! Given a graph, return a graph with edges to unmasked vertices removed in - //! mg + //! Given a graph, return a graph with edges to unmasked vertices removed in mg template void getMaskedGraph(size_t n, mask_t* masks, GraphTy* g, Graph& sub) { std::vector degrees(n, 0); @@ -70,55 +69,48 @@ class Sampler { //! determine degree of each vertex in a masked graph (given by masks and g) template - void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, - std::vector& degrees) { + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { assert(degrees.size() == n); #ifdef PARALLEL_GEN - galois::do_all( - galois::iterate(size_t(0), n), - [&](const auto src) { + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { #else for (size_t src = 0; src < n; src++) { #endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; - } - } + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; } } + } + } #ifdef PARALLEL_GEN - , - galois::loopname("update_degrees")); + , galois::loopname("update_degrees")); #endif } //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); - void checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size); + void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); public: - Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} + Sampler() : m(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} //! sample a subgraph sg of size n from graph g //! sg is overwritten/is output - void sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed = 0); + void generateSubgraph(VertexSet &vertex_set, mask_t* masks, Graph* sg); //! API function for user-defined selection strategy // TODO how to expose this? 
- virtual void selectVertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set); - virtual void selectVertices(size_t n, int m, VertexSet& vertex_set, - unsigned seed); + virtual void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); + virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); // galois::runtime::iterable > // neighbor_sampler(Graph &g, VertexID v); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e3c5efb038..4e6b839179 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -270,7 +270,7 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { } void DistContext::constructNormFactorSub(int subgraphID) { - galois::gPrint("Sub norm factor construction\n"); + //galois::gPrint("Sub norm factor construction\n"); // right now norm factor based on subgraph // TODO fix this for dist execution @@ -316,7 +316,7 @@ void DistContext::constructNormFactorSub(int subgraphID) { }, galois::loopname("NormCountingNode")); #endif - galois::gPrint("Sub norm factor construction done\n"); + //galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 966caaedf3..2eb18942a4 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -26,8 +26,7 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, - DGraph* dg) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { this->count_ = count; // save original graph Sampler::globalGraph = g; @@ -41,7 +40,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, // get degrees of nodes that will be in new graph this->getMaskedDegrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); - size_t ne = offsets[g->size()]; + auto ne = offsets[g->size()]; // save ids (of original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { @@ -86,7 +85,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, // helper function for graph saint implementation below void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size) { + std::vector& DB2, index_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); DB1.reserve(DB1.capacity() * 2); @@ -99,10 +98,8 @@ void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp -void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { - if (n < (size_t)m) { - m = n; - } +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) m = n; unsigned myseed = seed; // unsigned myseed = tid; @@ -127,7 +124,7 @@ void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; // printf(")\n"); - for (int i = 0; i < m; i++) { + for (index_t i = 0; i < m; i++) { auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; st.insert(v); @@ -139,11 +136,11 @@ void 
Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { // calculate prefix sum for IA0 and store in IA2 to compute the address for // each frontier in DB IA2[0] = IA0[0]; - for (int i = 1; i < m; i++) + for (index_t i = 1; i < m; i++) IA2[i] = IA2[i - 1] + IA0[i]; // now fill DB accordingly checkGSDB(DB0, DB1, DB2, IA2[m - 1]); - for (int i = 0; i < m; i++) { + for (index_t i = 0; i < m; i++) { db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; db_t DB_end = IA2[i]; for (auto j = DB_start; j < DB_end; j++) { @@ -154,7 +151,7 @@ void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { } db_t choose, neigh_v, newsize, tmp; - for (size_t itr = 0; itr < n - m; itr++) { + for (index_t itr = 0; itr < n - m; itr++) { choose = db_t(-1); while (choose == db_t(-1)) { tmp = rand_r(&myseed) % DB0.size(); @@ -249,24 +246,24 @@ void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { // n: number of vertices in the subgraph; // m: number of vertices in the frontier. // our implementation of GraphSAINT sampling -void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, +void Sampler::selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set) { // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, // graph size: ", g->size(), "\n"); assert(nv == vertices.size()); - auto frontier_indices = deepgalois::select_k_items( - m, 0, (int)nv); // randomly select m vertices from vertices as frontier + // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items((int)m, 0, (int)nv); VertexList frontier(m); - for (int i = 0; i < m; i++) + for (index_t i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int* degrees = new int[m]; - for (int i = 0; i < m; i++) { - // galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + for (index_t i = 0; i < m; i++) { degrees[i] = (int)getDegree(g, frontier[i]); } //, galois::loopname("compute_degrees")); - for (size_t i = 0; i < n - m; i++) { + for (index_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); auto u = frontier[pos]; auto degree = degrees[pos]; @@ -294,8 +291,7 @@ void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); - for (auto v : vertices) - masks[v] = 1; + for (auto v : vertices) masks[v] = 1; } inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { @@ -309,8 +305,7 @@ inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, - Graph& reindexGraph) { +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); @@ -328,9 +323,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, VertexList old_ids(keptVertices.begin(), 
keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all( - galois::iterate((size_t)0, nv), - [&](const auto i) { + galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif @@ -346,8 +339,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, } } #ifdef PARALLEL_GEN - , - galois::loopname("construct_graph")); + , galois::loopname("construct_graph")); #endif } @@ -362,14 +354,9 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } -void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, - unsigned seed) { - VertexSet sampledSet; +void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) { // n = 9000 by default - // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, - // sampledSet); do the sampling of vertices from training set + using masked - // graph - this->selectVertices(n, m_, sampledSet, seed); // m = 1000 by default + // do the sampling of vertices from training set + using masked graph // sampledSet is a list of *global* ids in the graph // create new vertex set with LIDs for partitioned graph @@ -388,10 +375,9 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, // this graph will contain sampled vertices and induced subgraph for it Graph maskedSG; // TODO use partMaskedGraph once constructed later - this->getMaskedGraph( - Sampler::partGraph->size(), masks, Sampler::partGraph, - maskedSG); // remove edges whose destination is not masked - this->reindexSubgraph(sampledLIDs, maskedSG, sg); + // remove edges whose destination is not masked + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); + this->reindexSubgraph(sampledLIDs, maskedSG, *sg); // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); } diff --git a/libdeepgalois/src/sampler.cu b/libdeepgalois/src/Sampler.cu similarity index 87% rename from libdeepgalois/src/sampler.cu rename to libdeepgalois/src/Sampler.cu index 6fb452db4c..b3f949ca39 100644 --- a/libdeepgalois/src/sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -97,8 +97,7 @@ inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { return new_ids; } -void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, - GraphGPU* subg) { +void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, GraphGPU* subg) { index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n); get_masked_degrees<<>>(n, masks, g, degrees); @@ -112,33 +111,26 @@ void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, CUDA_CHECK(cudaFree(pffsets)); } -// use a random walk to select vertex subset -void Sampler::select_vertices(size_t n, int m, VertexSet& st) {} - // n: size of the original graph // nv: size of the subgraph; i.e. 
size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) -void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, - GraphGPU* g, GraphGPU* sub) { +void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* g, GraphGPU* sub) { + auto nv = vertex_set.size(); // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); index_t* d_vertex_list; cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); - CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); index_t n = graph->size(); - update_masks(n, d_vertex_list, - masks); // set masks for vertices in the vertex_set - GraphGPU - masked_sg; // size is the same as original graph, but masked dst removed - generate_masked_graph( - n, masks, g, &masked_sg); // remove edges whose destination is not masked + update_masks(n, d_vertex_list, masks); // set masks for vertices in the vertex_set + GraphGPU masked_sg; // size is the same as original graph, but masked dst removed + generate_masked_graph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph - index_t* d_new_ids; // Given an old vertex ID โˆˆ [0, n), returns a new vertex - // ID โˆˆ [0, nv) + index_t* d_new_ids; cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); + // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) auto new_ids = reindexing_vertices(nv, vertex_set); CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); From ee98dfbd9b2fec69411a889d2be35aad4cf58533 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 13 May 2020 08:13:33 -0500 Subject: [PATCH 296/660] refine Sampler interface --- libdeepgalois/CMakeLists.txt | 4 +- libdeepgalois/include/deepgalois/Net.h | 24 ++-- libdeepgalois/include/deepgalois/Sampler.h | 75 +++------- libdeepgalois/src/RandomWalk.cpp | 153 +++++++++++++++++++++ libdeepgalois/src/Sampler.cpp | 108 ++++++++++----- libdeepgalois/src/Sampler.cu | 53 +++---- 6 files changed, 286 insertions(+), 131 deletions(-) create mode 100644 libdeepgalois/src/RandomWalk.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 064f24d0d7..e66443c22a 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -58,6 +58,7 @@ if(ENABLE_HETERO_GALOIS) src/math_functions.cu src/optimizer.cu src/DistContext.cu + src/Sampler.cu src/lgraph.cu src/node.cu src/Net.cu @@ -72,7 +73,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") - set(sources src/reader.cpp) + set(sources src/reader.cpp src/RandomWalk.cpp) else() set(sources src/layers/softmax_loss_layer.cpp @@ -85,6 +86,7 @@ else() src/math_functions.cpp src/optimizer.cpp src/DistContext.cpp + src/RandomWalk.cpp src/Sampler.cpp src/reader.cpp src/lgraph.cpp diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e62f68b297..e47664804f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -13,10 +13,7 @@ #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" - -#ifndef __GALOIS_HET_CUDA__ #include "deepgalois/Sampler.h" -#endif namespace deepgalois { @@ -87,10 
+84,7 @@ class Net { //! dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; DGraph* dGraph; - -#ifndef __GALOIS_HET_CUDA__ Sampler* sampler; -#endif public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, @@ -202,11 +196,13 @@ class Net { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; std::cout << header << "Constructing training vertex set induced graph...\n"; -#ifndef __GALOIS_HET_CUDA__ - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, - graphTopologyContext->getGraphPointer(), - distContext->getGraphPointer()); +#ifdef __GALOIS_HET_CUDA__ + auto gg = distContext->getGraphPointer(); +#else + auto gg = graphTopologyContext->getGraphPointer(); #endif + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, + distContext->getGraphPointer()); } std::cout << header << "Start training...\n"; @@ -228,15 +224,12 @@ class Net { t_subgen.Start(); // generate subgraphs -#ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, - &subgraphs_masks[sid * globalSamples], + sampler->generateSubgraph(sampledSet, &subgraphs_masks[sid * globalSamples], distContext->getSubgraphPointer(sid)); } -#endif num_subg_remain = num_subgraphs; t_subgen.Stop(); // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; @@ -373,7 +366,6 @@ class Net { masks = test_masks; } -#ifndef __GALOIS_HET_CUDA__ // switch to the original graph if not training if (subgraph_sample_size && type != "train") { for (size_t i = 0; i < num_layers; i++) @@ -385,7 +377,7 @@ class Net { layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data } -#else +#ifdef __GALOIS_HET_CUDA__ if (type == "train") { masks = d_train_masks; } else if (type == "val") { diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index bdff17e6e2..6b24c8fce2 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -35,70 +35,39 @@ class Sampler { void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); //! Given a graph, return a graph with edges to unmasked vertices removed in mg - template - void getMaskedGraph(size_t n, mask_t* masks, GraphTy* g, Graph& sub) { - std::vector degrees(n, 0); - this->getMaskedDegrees(n, masks, g, degrees); - // auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto offsets = deepgalois::prefix_sum(degrees); - size_t ne = offsets[n]; - // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", - // ne, "\n"); - - // note this constructs the full graph's nodes; just trims edges - sub.allocateFrom(n, ne); - sub.constructNodes(); - - galois::do_all( - galois::iterate((size_t)0, n), - [&](const auto src) { - sub.fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - sub.constructEdge(idx++, dst, 0); - } - } - } - }, - galois::loopname("gen_subgraph")); - } + template + void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); //! 
determine degree of each vertex in a masked graph (given by masks and g) template - void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { - assert(degrees.size() == n); -#ifdef PARALLEL_GEN - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; - } - } - } - } -#ifdef PARALLEL_GEN - , galois::loopname("update_degrees")); -#endif - } + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); - void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); + //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); + //! helper function to get degree of some vertex given some graph + inline unsigned getDegree(Graph* g, index_t v) { + return g->edge_end(v) - g->edge_begin(v); + } + + // helper function for graph saint implementation below + void checkGSDB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, index_t size) { + if (DB0.capacity() < size) { + DB0.reserve(DB0.capacity() * 2); + DB1.reserve(DB1.capacity() * 2); + DB2.reserve(DB2.capacity() * 2); + } + DB0.resize(size); + DB1.resize(size); + DB2.resize(size); + } + public: Sampler() : m(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp new file mode 100644 index 0000000000..09e76e9fc7 --- /dev/null +++ b/libdeepgalois/src/RandomWalk.cpp @@ -0,0 +1,153 @@ +#include +#include +#include +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) m = n; + unsigned myseed = seed; + + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; + // printf(")\n"); + + for (index_t i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; + st.insert(v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? 
SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB + IA2[0] = IA0[0]; + for (index_t i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; + // now fill DB accordingly + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); + for (index_t i = 0; i < m; i++) { + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (index_t itr = 0; itr < n - m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) + choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(Sampler::globalMaskedGraph, v); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = Sampler::globalMaskedGraph->getEdgeDst( + Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); + st.insert(neigh_v); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 0 : IA4[i - 1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i - 1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); + DB2[j] = IA3.size(); + } + } + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 2eb18942a4..aed0768ac0 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -1,8 +1,8 @@ -#include "deepgalois/utils.h" -#include "deepgalois/Sampler.h" -#include "galois/Galois.h" #include #include +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" #define PARALLEL_GEN namespace deepgalois { @@ -21,11 +21,6 @@ void print_vertex_set(VertexSet vertex_set) { galois::gPrint(")\n"); } -//! helper function to get degree of some vertex given some graph -inline unsigned getDegree(Graph* g, index_t v) { - return g->edge_end(v) - g->edge_begin(v); -} - void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { this->count_ = count; // save original graph @@ -51,22 +46,19 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); Sampler::globalMaskedGraph->constructNodes(); // same as original graph, except keep only edges involved in masks - galois::do_all( - galois::iterate((size_t)0, g->size()), - [&](const auto src) { - Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); - } - } + galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); } - }, - galois::loopname("gen_subgraph")); + } + } + }, galois::loopname("gen_subgraph")); Sampler::globalMaskedGraph->degree_counting(); Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); @@ -83,19 +75,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap //} } -// helper function for graph saint implementation below -void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, index_t size) { - if (DB0.capacity() < size) { - DB0.reserve(DB0.capacity() * 2); - DB1.reserve(DB1.capacity() * 2); - DB2.reserve(DB2.capacity() * 2); - } - DB0.resize(size); - DB1.resize(size); - DB2.resize(size); -} - +/* // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { @@ -239,6 +219,7 @@ void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); // print_vertex_set(st); } +*/ // API function for user-defined selection strategy // Select n vertices from vertices and put them in vertex_set. 
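(Background on the construction used in the following Sampler.cpp hunks: getMaskedDegrees and getMaskedGraph follow the standard two-pass CSR build, that is, count the surviving edges per vertex, prefix-sum the counts into row offsets, then fill the column array. Below is a self-contained sketch of that pattern using plain STL containers; SimpleCSR and buildMaskedCSR are illustrative names only, not DeepGalois API, and the library versions run both passes in parallel with galois::do_all.)

    #include <cstdint>
    #include <numeric>
    #include <vector>

    using index_t = uint32_t;
    using mask_t  = uint8_t;

    // Minimal CSR holder used only for this sketch (not a DeepGalois type).
    struct SimpleCSR {
      std::vector<index_t> rowptr; // size n+1
      std::vector<index_t> colidx; // size rowptr[n]
    };

    // Keep all n vertices, but keep an edge only if both endpoints are masked.
    SimpleCSR buildMaskedCSR(const SimpleCSR& g, const std::vector<mask_t>& masks) {
      const size_t n = g.rowptr.size() - 1;
      // pass 1: count surviving edges per source vertex
      std::vector<index_t> degrees(n, 0);
      for (size_t src = 0; src < n; src++) {
        if (!masks[src]) continue;
        for (index_t e = g.rowptr[src]; e < g.rowptr[src + 1]; e++)
          if (masks[g.colidx[e]]) degrees[src]++;
      }
      // prefix sum turns per-vertex counts into row offsets
      SimpleCSR sub;
      sub.rowptr.assign(n + 1, 0);
      std::partial_sum(degrees.begin(), degrees.end(), sub.rowptr.begin() + 1);
      sub.colidx.resize(sub.rowptr[n]);
      // pass 2: write the surviving destinations at each row's offset
      for (size_t src = 0; src < n; src++) {
        if (!masks[src]) continue;
        index_t idx = sub.rowptr[src];
        for (index_t e = g.rowptr[src]; e < g.rowptr[src + 1]; e++)
          if (masks[g.colidx[e]]) sub.colidx[idx++] = g.colidx[e];
      }
      return sub;
    }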
@@ -323,7 +304,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + galois::do_all(galois::iterate(size_t(0), size_t(nv)), [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif @@ -354,6 +335,57 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } +template +void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { + assert(degrees.size() == n); +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { +#else + for (size_t src = 0; src < n; src++) { +#endif + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } + } + } +#ifdef PARALLEL_GEN + , galois::loopname("update_degrees")); +#endif +} + +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub) { + std::vector degrees(n, 0); + this->getMaskedDegrees(n, masks, g, degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + size_t ne = offsets[n]; + // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, "\n"); + + // note this constructs the full graph's nodes; just trims edges + sub->allocateFrom(n, ne); + sub->constructNodes(); + + galois::do_all(galois::iterate(size_t(0), size_t(n)), [&](const auto src) { + sub->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + sub->constructEdge(idx++, dst, 0); + } + } + } + }, galois::loopname("gen_subgraph")); +} + void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) { // n = 9000 by default // do the sampling of vertices from training set + using masked graph @@ -376,7 +408,7 @@ void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) Graph maskedSG; // TODO use partMaskedGraph once constructed later // remove edges whose destination is not masked - this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, &maskedSG); this->reindexSubgraph(sampledLIDs, maskedSG, *sg); // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index b3f949ca39..a0528564dd 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -1,6 +1,7 @@ #include #include -#include "deepgalois/sampler.h" +#include "deepgalois/cutils.h" +#include "deepgalois/Sampler.h" namespace deepgalois { @@ -76,10 +77,15 @@ __global__ void generate_graph_kernel(index_t n, const index_t* offsets, } } -void Sampler::update_masks(size_t n, index_t* vertices, mask_t* masks) { - set_masks<<>>(n, vertices, masks); +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { + this->count_ = count; + // save original graph + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; } +/* void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) 
{ index_t vid = 0; for (index_t i = 0; i < n; i++) { @@ -87,8 +93,8 @@ void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { new_indices[v] = vid++; } } - -inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { +*/ +inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { VertexList new_ids(n, 0); int vid = 0; for (auto v : vertex_set) { @@ -97,24 +103,26 @@ inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { return new_ids; } -void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, GraphGPU* subg) { +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { index_t *degrees, *offsets; - CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n); - get_masked_degrees<<>>(n, masks, g, degrees); + CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); + get_masked_degrees<<>>(n, masks, *g, degrees); CUDA_CHECK(cudaFree(degrees)); - CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1)); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1))); thrust::exclusive_scan(thrust::device, degrees, degrees+n, offsets); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+n, sizeof(index_t), cudaMemcpyDeviceToHost)); - subg.allocateFrom(n, ne); // TODO: avoid reallocation - generate_masked_graph_kernel<<>>(n, masks, offsets, g, subg); - CUDA_CHECK(cudaFree(pffsets)); + subg->allocateFrom(n, ne); // TODO: avoid reallocation + generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); + CUDA_CHECK(cudaFree(offsets)); } // n: size of the original graph // nv: size of the subgraph; i.e. size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) -void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* g, GraphGPU* sub) { +void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { + index_t n = globalGraph->size(); auto nv = vertex_set.size(); // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); @@ -122,33 +130,32 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* g cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); - index_t n = graph->size(); - update_masks(n, d_vertex_list, masks); // set masks for vertices in the vertex_set + // createMasks: set masks for vertices in the vertex_set + set_masks<<>>(n, d_vertex_list, masks); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - generate_masked_graph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked + getMaskedGraph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph index_t* d_new_ids; cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) - auto new_ids = reindexing_vertices(nv, vertex_set); - CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), - cudaMemcpyHostToDevice)); + auto new_ids = reindexVertices(nv, vertex_set); + CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); // generate the offsets for the re-indexed subgraph index_t *degrees, *offsets; - CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*nv); + CUDA_CHECK(cudaMalloc((void**)°rees, 
sizeof(index_t)*nv)); get_new_degrees<<>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees); CUDA_CHECK(cudaFree(degrees)); - CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1)); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1))); thrust::exclusive_scan(thrust::device, degrees, degrees+nv, offsets); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); // allocate memory for the subgraph - sub.allocateFrom(nv, ne); // avoid reallocation + sub->allocateFrom(nv, ne); // avoid reallocation // generate the subgraph - generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, sub); + generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub); } } // namespace deepgalois From c0041ac4dbe85121ca40830067d4f6cf860f1bfe Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 13 May 2020 09:07:55 -0500 Subject: [PATCH 297/660] fix types --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/Net.h | 7 +-- libdeepgalois/include/deepgalois/Sampler.h | 10 ++-- libdeepgalois/src/RandomWalk.cpp | 57 +++++++++++++++++++ libdeepgalois/src/Sampler.cpp | 66 ++-------------------- libdeepgalois/src/Sampler.cu | 12 +--- 6 files changed, 71 insertions(+), 83 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index e66443c22a..a022a36655 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -73,7 +73,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") - set(sources src/reader.cpp src/RandomWalk.cpp) + set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp) else() set(sources src/layers/softmax_loss_layer.cpp diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e47664804f..7893e40502 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -196,11 +196,8 @@ class Net { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; std::cout << header << "Constructing training vertex set induced graph...\n"; -#ifdef __GALOIS_HET_CUDA__ - auto gg = distContext->getGraphPointer(); -#else - auto gg = graphTopologyContext->getGraphPointer(); -#endif + //auto gg = distContext->getGraphPointer(); + auto gg = graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, distContext->getGraphPointer()); } diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 6b24c8fce2..e823ef67ce 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -27,8 +27,8 @@ class Sampler { std::vector trainingNodes; //! masked original graph; typically to the training set - Graph* globalMaskedGraph; - Graph* globalGraph; + GraphCPU* globalMaskedGraph; + GraphCPU* globalGraph; DGraph* partGraph; //! Reindex a graph to only contain those in the vertex set @@ -39,7 +39,7 @@ class Sampler { void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); //! determine degree of each vertex in a masked graph (given by masks and g) - template + template void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); //! 
Set masks bitset with IDs in the vertices VertexSet @@ -51,7 +51,7 @@ class Sampler { VertexSet convertToLID(VertexSet& gidSet); //! helper function to get degree of some vertex given some graph - inline unsigned getDegree(Graph* g, index_t v) { + inline unsigned getDegree(GraphCPU* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } @@ -86,7 +86,7 @@ class Sampler { //! Given a mask, construct the graph with only those vertices ans ave as the //! masked graph in this class for the sampler. - void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg); + void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg); }; } // namespace deepgalois diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp index 09e76e9fc7..ed2b3528c1 100644 --- a/libdeepgalois/src/RandomWalk.cpp +++ b/libdeepgalois/src/RandomWalk.cpp @@ -1,11 +1,68 @@ #include #include #include +#include "galois/Galois.h" #include "deepgalois/utils.h" #include "deepgalois/Sampler.h" namespace deepgalois { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg) { + this->count_ = count; + // save original graph + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; + + // allocate the object for the new masked graph + Sampler::globalMaskedGraph = new GraphCPU(); + + std::vector degrees(g->size(), 0); + // get degrees of nodes that will be in new graph + //this->getMaskedDegrees(g->size(), masks, g, degrees); + galois::do_all(galois::iterate(size_t(0), g->size()), [&](const auto src) { + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) degrees[src]++; + } + } + } , galois::loopname("update_degrees")); + + auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto ne = offsets[g->size()]; + + // save ids (of original graph) of training nodes to vector + for (size_t i = 0; i < g->size(); i++) { + if (masks[i] == 1) + Sampler::trainingNodes.push_back(i); + } + + Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); + Sampler::globalMaskedGraph->constructNodes(); + // same as original graph, except keep only edges involved in masks + galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } + } + } + }, galois::loopname("gen_subgraph")); + + Sampler::globalMaskedGraph->degree_counting(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); + Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? 
SAMPLE_CLIP : avg_deg; + + // TODO masked part graph as well to save time later; right now constructing + // from full part graph +} + // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index aed0768ac0..b3cc862eca 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -21,60 +21,6 @@ void print_vertex_set(VertexSet vertex_set) { galois::gPrint(")\n"); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { - this->count_ = count; - // save original graph - Sampler::globalGraph = g; - // save partitioned graph - Sampler::partGraph = dg; - - // allocate the object for the new masked graph - Sampler::globalMaskedGraph = new Graph(); - - std::vector degrees(g->size(), 0); - // get degrees of nodes that will be in new graph - this->getMaskedDegrees(g->size(), masks, g, degrees); - auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto ne = offsets[g->size()]; - - // save ids (of original graph) of training nodes to vector - for (size_t i = 0; i < g->size(); i++) { - if (masks[i] == 1) - Sampler::trainingNodes.push_back(i); - } - - Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); - Sampler::globalMaskedGraph->constructNodes(); - // same as original graph, except keep only edges involved in masks - galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { - Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); - } - } - } - }, galois::loopname("gen_subgraph")); - - Sampler::globalMaskedGraph->degree_counting(); - Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); - Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? 
SAMPLE_CLIP : avg_deg; - - // TODO masked part graph as well to save time later; right now constructing - // from full part graph - - // size_t idx = 0; - // vertices_.resize(count); - // for (size_t i = begin; i < end; i++) { - // if (masks_[i] == 1) - // vertices_[idx++] = i; - //} -} - /* // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp @@ -337,12 +283,11 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { template void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { +//template <> +//void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, std::vector& degrees) { assert(degrees.size() == n); -#ifdef PARALLEL_GEN galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif + //for (size_t src = 0; src < n; src++) { if (masks[src] == 1) { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); @@ -352,10 +297,7 @@ void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector< } } } - } -#ifdef PARALLEL_GEN - , galois::loopname("update_degrees")); -#endif + } , galois::loopname("update_degrees")); } template diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index a0528564dd..69a66d1cfc 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -77,14 +77,6 @@ __global__ void generate_graph_kernel(index_t n, const index_t* offsets, } } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { - this->count_ = count; - // save original graph - Sampler::globalGraph = g; - // save partitioned graph - Sampler::partGraph = dg; -} - /* void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { index_t vid = 0; @@ -122,7 +114,7 @@ void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s // nv: size of the subgraph; i.e. 
size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { - index_t n = globalGraph->size(); + index_t n = partGraph->size(); auto nv = vertex_set.size(); // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); @@ -133,7 +125,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s // createMasks: set masks for vertices in the vertex_set set_masks<<>>(n, d_vertex_list, masks); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - getMaskedGraph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked + getMaskedGraph(n, masks, partGraph, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph index_t* d_new_ids; From 755baf19c50033e12d9677b2e6526f2ecc28b4ae Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 13 May 2020 13:22:08 -0500 Subject: [PATCH 298/660] extend csrgraph --- .../include/deepgalois/DistContext.h | 8 +- libdeepgalois/include/deepgalois/Net.h | 26 +++--- libdeepgalois/include/deepgalois/Sampler.h | 15 +++- libdeepgalois/include/deepgalois/lgraph.h | 10 ++- .../include/deepgalois/math_functions.hh | 9 +- libdeepgalois/src/DistContext.cpp | 87 ++++++++----------- libdeepgalois/src/DistContext.cu | 38 ++++++-- libdeepgalois/src/Net.cpp | 5 +- libdeepgalois/src/Net.cu | 8 +- libdeepgalois/src/RandomWalk.cpp | 13 +-- libdeepgalois/src/Sampler.cpp | 9 -- libdeepgalois/src/Sampler.cu | 39 ++++++--- libdeepgalois/src/math_functions.cu | 16 +++- libdeepgalois/src/reader.cpp | 2 +- libgpu/include/graph_gpu.h | 17 +++- libgpu/src/csr_graph.cu | 11 ++- 16 files changed, 189 insertions(+), 124 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index ff28bb607c..08e101e898 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -116,7 +116,13 @@ class DistContext { float_t* get_in_ptr(); //! allocate memory for subgraphs (don't actually build them) - void allocateSubgraphs(int num_subgraphs); + void allocateSubgraphs(int num_subgraphs, unsigned max_size) { + partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + partitionedSubgraphs[i] = new Graph(); + partitionedSubgraphs[i]->set_max_size(max_size); + } + } }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 7893e40502..082949d7fb 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -72,6 +72,7 @@ class Net { mask_t* d_test_masks; // masks for test on device mask_t* subgraphs_masks; // masks for subgraphs; size of local graph + mask_t* d_subgraphs_masks; // masks for subgraphs on device; size of local graph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -164,7 +165,7 @@ class Net { // features are read in distcontext, not this context (this context only // used for sampling) - init(); + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); } //! 
Default net constructor @@ -180,7 +181,7 @@ class Net { // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), // test_masks(NULL), context(NULL) {} - void init(); + void allocateSubgraphsMasks(int num_subgraphs); //! Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); @@ -193,8 +194,8 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { - distContext->allocateSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); + allocateSubgraphsMasks(num_subgraphs); std::cout << header << "Constructing training vertex set induced graph...\n"; //auto gg = distContext->getGraphPointer(); auto gg = graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem @@ -224,7 +225,7 @@ class Net { for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, &subgraphs_masks[sid * globalSamples], + sampler->generateSubgraph(sampledSet, subgraphs_masks + sid * globalSamples, distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; @@ -245,8 +246,8 @@ class Net { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - // galois::gPrint("Subgraph num_vertices: ", subgraphNumVertices, ", - // num_edges: ", subgraphPointer->sizeEdges(), "\n"); + std::cout << "Subgraph num_vertices: " << subgraphNumVertices + << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); } @@ -256,18 +257,17 @@ class Net { distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr( - distContext->get_norm_factors_subg_ptr()); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); } // update labels for subgraph - distContext->constructSubgraphLabels( - this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + distContext->constructSubgraphLabels(this->subgraphNumVertices, + subgraphs_masks + sg_id * globalSamples); layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); // update features for subgraph - distContext->constructSubgraphFeatures( - this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + distContext->constructSubgraphFeatures(this->subgraphNumVertices, + subgraphs_masks + sg_id * globalSamples); layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data // Graph* testing = distContext->getSubgraphPointer(sg_id); diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index e823ef67ce..b8f19dcca7 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -44,7 +44,7 @@ class Sampler { //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); - inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + //inline VertexList reindexVertices(size_t n, VertexSet vertex_set); //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids @@ -52,7 +52,16 @@ class Sampler { //! 
helper function to get degree of some vertex given some graph inline unsigned getDegree(GraphCPU* g, index_t v) { - return g->edge_end(v) - g->edge_begin(v); + return g->edge_end_host(v) - g->edge_begin_host(v); + } + + inline VertexList reindexVertices(size_t n, VertexSet vertex_set) { + VertexList new_ids(n, 0); + int vid = 0; + for (auto v : vertex_set) { + new_ids[v] = vid++; // reindex + } + return new_ids; } // helper function for graph saint implementation below @@ -78,7 +87,7 @@ class Sampler { //! API function for user-defined selection strategy // TODO how to expose this? - virtual void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); + void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); // galois::runtime::iterable > diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index e9a185bfac..e0527b2161 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -16,6 +16,7 @@ class LearningGraph { // typedef index_t* IndexList; protected: bool is_device; + index_t max_size_; index_t num_vertices_; index_t num_edges_; IndexList rowptr_; @@ -34,8 +35,9 @@ class LearningGraph { public: typedef size_t iterator; LearningGraph(bool use_gpu) - : is_device(use_gpu), num_vertices_(0), num_edges_(0), vertex_data_(NULL), - edge_data_(NULL) {} + : is_device(use_gpu), max_size_(0), + num_vertices_(0), num_edges_(0), + vertex_data_(NULL), edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { @@ -55,6 +57,7 @@ class LearningGraph { void dealloc(); void degree_counting(); void constructNodes() {} + void set_max_size(index_t max) { assert(max>0); max_size_ = max; } void readGraph(std::string dataset, bool selfloop = false); void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } @@ -121,6 +124,9 @@ class LearningGraph { index_t* row_start_host_ptr() { return &rowptr_[0]; } index_t* edge_dst_host_ptr() { return &colidx_[0]; } + index_t getEdgeDstHost(index_t eid) { return colidx_[eid]; } + index_t edge_begin_host(index_t vid) { return rowptr_[vid]; } + index_t edge_end_host(index_t vid) { return rowptr_[vid + 1]; } #ifndef __GALOIS_HET_CUDA__ index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 89cc3d5d9c..6c002e2ffb 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -163,10 +163,11 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, - float_t* loss); +void uint8_malloc_device(int n, uint8_t*& ptr); +void uint8_free_device(uint8_t*& ptr); +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); acc_t l2_norm_gpu(int n, const float_t* in); void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); -void d_l2_norm_gpu(size_t x, size_t y, const float_t* 
in_data, float_t* in_diff, - float_t* out_diff); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, float_t* out_diff); #endif diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 4e6b839179..e6c6121d80 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -321,66 +321,55 @@ void DistContext::constructNormFactorSub(int subgraphID) { //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg.resize(m); + } else { + DistContext::h_labels_subg.resize(m * DistContext::num_classes); + } + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { if (DistContext::usingSingleClass) { - DistContext::h_labels_subg.resize(m); + DistContext::h_labels_subg[count] = h_labels[i]; } else { - DistContext::h_labels_subg.resize(m * DistContext::num_classes); - } - - size_t count = 0; - // see which labels to copy over for this subgraph - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { - if (DistContext::usingSingleClass) { - DistContext::h_labels_subg[count] = h_labels[i]; - } else { - std::copy( - DistContext::h_labels + i * DistContext::num_classes, - DistContext::h_labels + (i + 1) * DistContext::num_classes, - &DistContext::h_labels_subg[count * DistContext::num_classes]); - } - // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], - // "\n"); - count++; - } + std::copy( + DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); } - GALOIS_ASSERT(count == m); + // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } //! generate input features for the subgraph, m is subgraph size, //! 
masks tells which vertices to use void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { - size_t count = 0; - // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - DistContext::h_feats_subg.resize(m * feat_len); - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { - std::copy(DistContext::h_feats + i * DistContext::feat_len, - DistContext::h_feats + (i + 1) * DistContext::feat_len, - &DistContext::h_feats_subg[count * DistContext::feat_len]); - // for (unsigned a = 0; a < DistContext::feat_len; a++) { - // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { - // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], - // " "); - // } - //} - // galois::gPrint("\n"); - count++; - } - } - GALOIS_ASSERT(count == m); + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + // for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], + // " "); + // } + //} + // galois::gPrint("\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } - galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { - return DistContext::syncSubstrate; -}; - -void DistContext::allocateSubgraphs(int num_subgraphs) { - partitionedSubgraphs.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) { - partitionedSubgraphs[i] = new Graph(); - } + return DistContext::syncSubstrate; } } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 91d39bb9a4..7542849cef 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -107,11 +107,40 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, s return reader.read_masks(mask_type, n, begin, end, masks); } -void DistContext::allocateSubgraphs(int n_sg) {} +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + size_t labels_size = m; + if (!usingSingleClass) labels_size = m * num_classes; + h_labels_subg.resize(labels_size); + size_t count = 0; + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (usingSingleClass) h_labels_subg[count] = h_labels[i]; + else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + &h_labels_subg[count * num_classes]); + count++; + } + } + if (d_labels_subg) uint8_free_device(d_labels_subg); + uint8_malloc_device(labels_size, d_labels_subg); + uint8_copy_device(labels_size, &h_labels_subg[0], d_labels_subg); +} -void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) {} +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, &h_feats_subg[count * feat_len]); + count++; + } + } + if (d_feats_subg) float_free_device(d_feats_subg); + float_malloc_device(m * feat_len, d_feats_subg); + float_copy_device(m * feat_len, &h_feats_subg[0], d_feats_subg); +} -void 
DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) {} +void DistContext::constructNormFactorSub(int subgraphID) { +} void DistContext::constructNormFactor(deepgalois::Context* globalContext) { auto n = partitionedGraph->size(); @@ -135,9 +164,6 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { std::cout << "Done\n"; } -void DistContext::constructNormFactorSub(int subgraphID) { -} - /* void DistContext::SetDevice(const int device_id) { int current_device; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 41ce7b2d77..3bc7762fd5 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -72,9 +72,8 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, feature_dims[num_layers] = num_classes; // normalized output embedding: E } -void Net::init() { - if (subgraph_sample_size) - sampler = new deepgalois::Sampler(); +void Net::allocateSubgraphsMasks(int num_subgraphs) { + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; } // add weight decay diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index b63e5df3a6..7b76f217dd 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -147,12 +147,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { -void Net::init() { - copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); - copy_masks_device(globalSamples, globalValMasks, d_val_masks); +void Net::allocateSubgraphsMasks(int num_subgraphs) { + CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); } void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { + copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); + copy_masks_device(globalSamples, globalValMasks, d_val_masks); + this->distContext = new deepgalois::DistContext(); this->distContext->set_dataset(dataset_str); diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp index ed2b3528c1..cf2112ca60 100644 --- a/libdeepgalois/src/RandomWalk.cpp +++ b/libdeepgalois/src/RandomWalk.cpp @@ -18,12 +18,13 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG Sampler::globalMaskedGraph = new GraphCPU(); std::vector degrees(g->size(), 0); + galois::gPrint("graph size: ", g->size(), "\n"); // get degrees of nodes that will be in new graph //this->getMaskedDegrees(g->size(), masks, g, degrees); galois::do_all(galois::iterate(size_t(0), g->size()), [&](const auto src) { if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { + const auto dst = g->getEdgeDstHost(e); if (masks[dst] == 1) degrees[src]++; } } @@ -45,8 +46,8 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { + const auto dst = g->getEdgeDstHost(e); if (masks[dst] == 1) { // galois::gPrint(src, " ", dst, "\n"); Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); @@ -131,8 +132,8 @@ void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { auto degree = 
getDegree(Sampler::globalMaskedGraph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = Sampler::globalMaskedGraph->getEdgeDst( - Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); + neigh_v = Sampler::globalMaskedGraph->getEdgeDstHost( + Sampler::globalMaskedGraph->edge_begin_host(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index b3cc862eca..1feb2ecb69 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -221,15 +221,6 @@ void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { for (auto v : vertices) masks[v] = 1; } -inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { - VertexList new_ids(n, 0); - int vid = 0; - for (auto v : vertex_set) { - new_ids[v] = vid++; // reindex - } - return new_ids; -} - // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index 69a66d1cfc..c5db16c5f1 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -5,6 +5,10 @@ namespace deepgalois { +__global__ void clear_masks(index_t n, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[i] = 0; } +} + // set the masks of vertices in a given vertex set // n is the size of the vertex set __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { @@ -16,6 +20,8 @@ __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, index_t* degrees) { CUDA_KERNEL_LOOP(src, n) { + if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); + degrees[src] = 0; if (masks[src] == 1) { for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { auto dst = g.getEdgeDst(e); @@ -23,6 +29,7 @@ __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, degrees[src]++; } } + if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); } } @@ -86,25 +93,19 @@ void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { } } */ -inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { - VertexList new_ids(n, 0); - int vid = 0; - for (auto v : vertex_set) { - new_ids[v] = vid++; // reindex - } - return new_ids; -} template void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { + std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); get_masked_degrees<<>>(n, masks, *g, degrees); - CUDA_CHECK(cudaFree(degrees)); CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1))); - thrust::exclusive_scan(thrust::device, degrees, degrees+n, offsets); + thrust::exclusive_scan(thrust::device, degrees, degrees+n+1, offsets); + CUDA_CHECK(cudaFree(degrees)); index_t ne; - CUDA_CHECK(cudaMemcpy(&ne, offsets+n, sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&ne, &offsets[n], sizeof(index_t), cudaMemcpyDeviceToHost)); + std::cout << "maskedSG num_edges " << ne << "\n"; subg->allocateFrom(n, ne); // TODO: avoid reallocation generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); CUDA_CHECK(cudaFree(offsets)); @@ -116,38 +117,48 @@ void 
Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { index_t n = partGraph->size(); auto nv = vertex_set.size(); + std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); index_t* d_vertex_list; cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); + clear_masks<<>>(n, masks); // set all 0 + CudaTest("solving clear_masks kernel failed"); // createMasks: set masks for vertices in the vertex_set set_masks<<>>(n, d_vertex_list, masks); + CudaTest("solving set_masks kernel failed"); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed getMaskedGraph(n, masks, partGraph, &masked_sg); // remove edges whose destination is not masked + std::cout << "maskedGraph generated\n"; // re-index the subgraph index_t* d_new_ids; cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) - auto new_ids = reindexVertices(nv, vertex_set); + auto new_ids = reindexVertices(n, vertex_set); CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); // generate the offsets for the re-indexed subgraph index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*nv)); get_new_degrees<<>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees); - CUDA_CHECK(cudaFree(degrees)); + CudaTest("solving get_new_degrees kernel failed"); CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1))); - thrust::exclusive_scan(thrust::device, degrees, degrees+nv, offsets); + thrust::exclusive_scan(thrust::device, degrees, degrees+nv+1, offsets); + CUDA_CHECK(cudaFree(degrees)); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); + std::cout << "subgraph num_edges " << ne << "\n"; // allocate memory for the subgraph sub->allocateFrom(nv, ne); // avoid reallocation // generate the subgraph generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub); + CudaTest("solving generate_graph kernel failed"); + CUDA_CHECK(cudaFree(offsets)); + std::cout << "Subgraph generated\n"; } } // namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 246091903c..9a7c4bc1dd 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -69,15 +69,23 @@ void float_malloc_device(int n, float_t*& ptr) { void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr) { - CUDA_CHECK( - cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} + +void uint8_malloc_device(int n, uint8_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(uint8_t))); +} + +void uint8_free_device(uint8_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } + +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(uint8_t), cudaMemcpyHostToDevice)); } void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK( - 
cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } __global__ void setup_curand_kernel(const int n, curandState* state) { diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 961b852ded..6e6e00a5d1 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -209,7 +209,7 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); } - printf("num_vertices %lu, num_edges %lu.\n", nv, ne); + printf("num_vertices %lu num_edges %lu\n", nv, ne); g->allocateFrom(nv, ne); auto rowptr = g->row_start_host_ptr(); for (unsigned vid = 0; vid < nv; ++vid) { diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 4c480bd8fa..4ddf57b950 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -175,15 +175,24 @@ struct CSRGraph { edge_dst[eid] = dst; if (edge_data) edge_data[eid] = edata; } - void malloc_index_device(index_type n, index_type *ptr); + void malloc_index_device(index_type n, index_type*& ptr); + void free_index_device(index_type*& ptr); void set_index(index_type pos, index_type value, index_type *ptr); void allocateFrom(index_type nv, index_type ne) { + bool need_realloc = false; + if (nedges < ne) need_realloc = true; nnodes = nv; nedges = ne; - malloc_index_device(nedges, edge_dst); - malloc_index_device(nnodes+1, row_start); + if (max_size < nnodes) max_size = nnodes; + printf("allocating memory on gpu nnodes %d nedges %d\n", max_size, nedges); + if (need_realloc) { + if (edge_dst) free_index_device(edge_dst); + malloc_index_device(nedges, edge_dst); + } + if (!row_start) malloc_index_device(max_size+1, row_start); set_index(0, 0, row_start); } + void set_max_size(index_type max) { assert(max>0); max_size = max; } size_t size() { return size_t(nnodes); } size_t sizeEdges() { return size_t(nedges); } void degree_counting() {} @@ -194,5 +203,7 @@ struct CSRGraph { edge_data_type* edge_data; node_data_type* node_data; bool device_graph; + index_type max_size; // this is for reallocation; avoid re-malloc + bool is_allocated; // this is for reallocation }; #endif diff --git a/libgpu/src/csr_graph.cu b/libgpu/src/csr_graph.cu index e7be218138..19ca915cd0 100644 --- a/libgpu/src/csr_graph.cu +++ b/libgpu/src/csr_graph.cu @@ -21,7 +21,8 @@ unsigned CSRGraph::init() { node_data = NULL; nnodes = nedges = 0; device_graph = false; - + is_allocated = false; + max_size = 0; return 0; } @@ -46,7 +47,11 @@ unsigned CSRGraph::allocOnHost(bool no_edge_data) { return ((no_edge_data || edge_data) && row_start && edge_dst && node_data); } -void CSRGraph::malloc_index_device(index_type n, index_type *ptr) { +void CSRGraph::free_index_device(index_type*& ptr) { + check_cuda(cudaFree(ptr)); +} + +void CSRGraph::malloc_index_device(index_type n, index_type*& ptr) { check_cuda(cudaMalloc((void **) &ptr, n * sizeof(index_type))); } @@ -213,7 +218,7 @@ unsigned CSRGraph::readFromGR(const char file[], bool read_edge_data) { nnodes = numNodes; nedges = numEdges; - printf("nnodes=%d, nedges=%d, sizeEdge=%d.\n", nnodes, nedges, sizeEdgeTy); + printf("nnodes %d nedges %d sizeEdge %d\n", nnodes, nedges, sizeEdgeTy); allocOnHost(!read_edge_data); row_start[0] = 0; From 4010b586f223261a701912255123aefcd421aa0d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 14 May 2020 14:44:25 -0500 Subject: [PATCH 299/660] cout->gPrint, host ID headers, general cleanup 
makes dist execution logs easier to parse --- libdeepgalois/include/deepgalois/Context.h | 11 +- libdeepgalois/include/deepgalois/Net.h | 102 +++++++++------ .../include/deepgalois/layers/layer.h | 79 ++++++------ libdeepgalois/src/DistContext.cpp | 122 +++++++++--------- libdeepgalois/src/Net.cpp | 25 ++-- libdeepgalois/src/reader.cpp | 26 ++-- libdeepgalois/src/utils.cpp | 6 +- lonestar/gnn/gcn/gcn.cpp | 3 +- lonestar/gnn/include/engine.h | 44 +++---- 9 files changed, 224 insertions(+), 194 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 6200540847..ba3d1510bf 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -25,23 +25,22 @@ class Context { dataset = dataset_str; reader.init(dataset); } - size_t read_masks(std::string mask_type, size_t n, - size_t& begin, size_t& end, mask_t* masks) { + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks) { return reader.read_masks(mask_type, n, begin, end, masks); } size_t read_graph(bool selfloop) { - graph_cpu = new GraphCPU(); + graph_cpu = new GraphCPU(); graph_cpu->readGraph(dataset, selfloop); is_selfloop_added = selfloop; - std::cout << "num_vertices " << graph_cpu->size() - << " num_edges " << graph_cpu->sizeEdges() << "\n"; return graph_cpu->size(); } //! Checks if subgraph being used, sets currenet graph, then calls degreex //! counting GraphCPU* getFullGraph() { - graph_cpu->degree_counting(); // TODO: why is it here? should be in read_graph + graph_cpu + ->degree_counting(); // TODO: why is it here? should be in read_graph return graph_cpu; } }; diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 082949d7fb..3971da74d2 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -71,8 +71,9 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs; size of local graph - mask_t* d_subgraphs_masks; // masks for subgraphs on device; size of local graph + mask_t* subgraphs_masks; // masks for subgraphs; size of local graph + mask_t* + d_subgraphs_masks; // masks for subgraphs on device; size of local graph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -90,7 +91,8 @@ class Net { public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, - bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, int val_itv) + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv) : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), @@ -98,7 +100,7 @@ class Net { val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host #ifndef __GALOIS_HET_CUDA__ - this->myID = galois::runtime::getSystemNetworkInterface().ID; + this->myID = galois::runtime::getSystemNetworkInterface().ID; #endif this->header = "[" + std::to_string(myID) + "] "; this->seperator = " "; @@ -161,11 +163,13 @@ class Net { layers.resize(num_layers); // hidden1 level embedding: 16 - for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = 
this->h1; + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = this->h1; // features are read in distcontext, not this context (this context only // used for sampling) - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); } //! Default net constructor @@ -183,8 +187,9 @@ class Net { void allocateSubgraphsMasks(int num_subgraphs); - //! Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); + //! Initializes metadata for the partition: loads data, labels, etc + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } void regularize(); // add weight decay @@ -196,14 +201,16 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); allocateSubgraphsMasks(num_subgraphs); - std::cout << header << "Constructing training vertex set induced graph...\n"; - //auto gg = distContext->getGraphPointer(); - auto gg = graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + std::cout << header + << "Constructing training vertex set induced graph...\n"; + // auto gg = distContext->getGraphPointer(); + auto gg = + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, distContext->getGraphPointer()); } - std::cout << header << "Start training...\n"; + galois::gPrint(header, "Start training...\n"); Timer t_epoch; @@ -216,7 +223,8 @@ class Net { //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - std::cout << header << "Generating " << num_subgraphs << " subgraph(s)\n"; + std::cout << header << "Generating " << num_subgraphs + << " subgraph(s)\n"; // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); @@ -224,8 +232,10 @@ class Net { // generate subgraphs for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; - sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, subgraphs_masks + sid * globalSamples, + sampler->selectVertices(subgraph_sample_size, sampledSet, + curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + subgraphs_masks + sid * globalSamples, distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; @@ -246,7 +256,7 @@ class Net { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - std::cout << "Subgraph num_vertices: " << subgraphNumVertices + std::cout << "Subgraph num_vertices: " << subgraphNumVertices << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); @@ -257,18 +267,21 @@ class Net { distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); } // update labels for subgraph - distContext->constructSubgraphLabels(this->subgraphNumVertices, - subgraphs_masks + sg_id * globalSamples); - 
layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); + distContext->constructSubgraphLabels( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[num_layers - 1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); // update features for subgraph - distContext->constructSubgraphFeatures(this->subgraphNumVertices, - subgraphs_masks + sg_id * globalSamples); - layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data // Graph* testing = distContext->getSubgraphPointer(sg_id); // for (size_t i = 0; i < testing->size(); i++) { @@ -281,28 +294,31 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - std::cout << header << "Epoch " << std::setw(3) << curEpoch << seperator; + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; + galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); + galois::gPrint(header, "Calling into backward propagation\n"); // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients Net::bprop(); - // gradient update: use gradients stored on each layer to update model for - // next epoch + galois::gPrint(header, "Weight update call\n"); + // gradient update: use gradients stored on each layer to update model + // for next epoch Net::update_weights(opt); // update parameters // validation / testing set_netphases(net_phase::test); - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << seperator; + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, "\n"); t_epoch.Stop(); @@ -313,22 +329,22 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed - << val_loss << " val_acc " << val_acc << seperator; - std::cout << header << "time " << std::setprecision(3) << std::fixed - << epoch_time + val_time << " ms (train_time " << epoch_time - << " val_time " << val_time << ")\n"; + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, "\n"); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); } else { - std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time - << ")\n"; + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + ")\n"); } } // epoch loop double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - std::cout << header << "Average training time per epoch: " << avg_train_time - << " ms. 
Throughput: " << throughput << " epoch/s\n"; + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. inference or predict @@ -384,7 +400,9 @@ class Net { } #endif + galois::gPrint(header, "Doing actual forward propagation\n"); loss = fprop(begin, end, count, masks); + galois::gPrint(header, "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); // labels will be subgraph labels if applicable @@ -409,11 +427,11 @@ class Net { // read masks of test set void read_test_masks(std::string dataset); - //void copy_test_masks_to_device(); + // void copy_test_masks_to_device(); void construct_layers() { // append conv layers - std::cout << "\nConstructing layers...\n"; + galois::gPrint(header, "Constructing layers...\n"); for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } @@ -519,11 +537,15 @@ class Net { // set mask for the last layer; globals // TODO this should be distirbuted sample begin->end not global; fix later // seems to be unused in code right now anyways + galois::gPrint(header, "fprop: set sample mask\n"); layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); for (size_t i = 0; i < num_layers; i++) { + galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } + + galois::gPrint(header, "fprop: getting loss\n"); // prediction error auto loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 91b57c7041..99ec74fb4a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -40,6 +40,44 @@ class layer : public deepgalois::node { public: using ContextType = deepgalois::DistContext; +protected: + const std::string header = + "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + "] "; + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + size_t num_dims; // number of dimensions + net_phase phase_; // in which phase: train, val or test + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + bool use_mask; + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x + // 16, layer1: 16 x E + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; + float_t* d_weight_grad; + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; + float_t* loss; // error for each vertex: N x 1 + ContextType* context; + label_t* labels; + float_t* norm_consts; +// TODO +#ifdef __GALOIS_HET_CUDA__ + GraphGPU* graph_gpu; +#else + Graph* graph_cpu; + // Used for synchronization of weight gradients + deepgalois::GluonGradients* gradientGraph; + galois::graphs::GluonSubstrate* syncSub; +#endif + +public: layer(unsigned level, std::vector in_dims, std::vector out_dims) : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), @@ -48,9 +86,10 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function - std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" - << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + "input[", input_dims[0], ",", input_dims[1], "] output[", + output_dims[0], ",", output_dims[1], "]\n"); } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } @@ -148,40 +187,6 @@ class layer : public deepgalois::node { // prev()->clear_grads(); next()->clear_grads(); } - -protected: - unsigned level_; // layer id: [0, num_layers-1] - size_t begin_; // sample begin index - size_t end_; // sample end index - size_t count_; // number of samples - size_t num_dims; // number of dimensions - net_phase phase_; // in which phase: train, val or test - std::vector input_dims; // input dimensions - std::vector output_dims; // output dimentions - std::string name_; // name of this layer - bool trainable_; // is this layer trainable - bool use_mask; - vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x - // 16, layer1: 16 x E - vec_t weight_grad; // weight gradient for updating parameters - float_t* d_W; - float_t* d_weight_grad; - mask_t* masks_; // masks to show which samples are valid - mask_t* d_masks_; - float_t* loss; // error for each vertex: N x 1 - ContextType* context; - label_t* labels; - float_t* norm_consts; -// TODO -#ifdef __GALOIS_HET_CUDA__ - GraphGPU* graph_gpu; -#else - Graph* graph_cpu; - // Used for synchronization of weight gradients - deepgalois::GluonGradients* gradientGraph; - galois::graphs::GluonSubstrate* syncSub; -#endif }; //! 
Connects tail to head's edge and sets that edge's target to tail diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e6c6121d80..b9caa7ef5a 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -30,7 +30,8 @@ void DistContext::saveDistGraph(DGraph* a) { } // TODO move to reader class -size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { +size_t DistContext::read_labels(bool isSingleClassLabel, + std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -49,10 +50,10 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str if (isSingleClassLabel) { galois::gPrint("[", myID, "] One hot labels...\n"); // single-class (one-hot) label for each vertex: N x 1 - this->h_labels = new label_t[dGraph->size()]; + this->h_labels = new label_t[dGraph->size()]; } else { galois::gPrint("[", myID, "] Multi-class labels...\n"); - this->h_labels = new label_t[dGraph->size() * this->num_classes]; + this->h_labels = new label_t[dGraph->size() * this->num_classes]; // multi-class label for each vertex: N x E } @@ -113,7 +114,7 @@ size_t DistContext::read_features(std::string dataset_str) { ifs >> m >> this->feat_len >> std::ws; ifs.close(); - galois::gPrint("N x D: ", m, " x ", feat_len, "\n"); + galois::gPrint("[", myID, "] N x D: ", m, " x ", feat_len, "\n"); // TODO read in without using 2 in-memory buffers // full read feats to load into h_feats @@ -151,6 +152,8 @@ size_t DistContext::read_features(std::string dataset_str) { size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -159,8 +162,7 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, } } if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); + GALOIS_DIE("Dataset currently not supported"); } size_t i = 0; size_t sample_count = 0; @@ -185,9 +187,9 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, } i++; } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "(" - << (float)sample_count / (float)n * (float)100 << "\%)\n"; + galois::gPrint("[", myID, "] ", mask_type, "_mask range: [", begin, ", ", end, + ") Number of valid samples: ", sample_count, "(", + (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); return sample_count; } @@ -207,7 +209,6 @@ void DistContext::allocNormFactor() { #else this->normFactors.resize(partitionedGraph->size()); #endif - // TODO clean out? } void DistContext::allocNormFactorSub(int subID) { @@ -216,11 +217,11 @@ void DistContext::allocNormFactorSub(int subID) { #else this->normFactorsSub.resize(partitionedSubgraphs[subID]->size()); #endif - // TODO clean out? 
} void DistContext::constructNormFactor(deepgalois::Context* globalContext) { - galois::gPrint("Norm factor construction\n"); + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Norm factor construction\n"); // using original graph to get ids Graph* wholeGraph = globalContext->getFullGraph(); @@ -233,25 +234,26 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { //); #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), - [&] (unsigned i) { - float_t c_i = - std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - - for (auto e = partitionedGraph->edge_begin(i); - e != partitionedGraph->edge_end(i); e++) { - const auto j = partitionedGraph->getEdgeDst(e); - float_t c_j = std::sqrt( - float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); - - if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[*e] = 0.0; - } else { - this->normFactors[*e] = 1.0 / (c_i * c_j); + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned i) { + float_t c_i = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[*e] = 0.0; + } else { + this->normFactors[*e] = 1.0 / (c_i * c_j); + } } - } - }, - galois::loopname("NormCountingEdge")); + }, + galois::loopname("NormCountingEdge")); #else galois::do_all( galois::iterate((size_t)0, partitionedGraph->size()), @@ -266,40 +268,42 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { }, galois::loopname("NormCountingNode")); #endif - galois::gPrint("Norm factor construction done\n"); + galois::gPrint("[", myID, "] Norm factor construction done \n"); } void DistContext::constructNormFactorSub(int subgraphID) { - //galois::gPrint("Sub norm factor construction\n"); - // right now norm factor based on subgraph - // TODO fix this for dist execution + // galois::gPrint("Sub norm factor construction\n"); + // right now norm factor based on subgraph + // TODO fix this for dist execution - allocNormFactorSub(subgraphID); + allocNormFactorSub(subgraphID); - Graph& graphToUse = *partitionedSubgraphs[subgraphID]; - graphToUse.degree_counting(); + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); - // TODO using partitioned subgraph rather than whoel graph; i.e. dist - // setting wrong + // TODO using partitioned subgraph rather than whoel graph; i.e. 
dist + // setting wrong #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, graphToUse.size()), - [&] (unsigned i) { - // float_t c_i = - // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); - - for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); - e++) { - const auto j = graphToUse.getEdgeDst(e); - float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); - - if (c_i == 0.0 || c_j == 0.0) { - this->normFactorsSub[e] = 0.0; - } else { - this->normFactorsSub[e] = 1.0 / (c_i * c_j); + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned i) { + // float_t c_i = + // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); + + for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); + e++) { + const auto j = graphToUse.getEdgeDst(e); + float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactorsSub[e] = 0.0; + } else { + this->normFactorsSub[e] = 1.0 / (c_i * c_j); + } } - } - }, galois::loopname("NormCountingEdge")); + }, + galois::loopname("NormCountingEdge")); #else galois::do_all( galois::iterate((size_t)0, graphToUse.size()), @@ -316,7 +320,7 @@ void DistContext::constructNormFactorSub(int subgraphID) { }, galois::loopname("NormCountingNode")); #endif - //galois::gPrint("Sub norm factor construction done\n"); + // galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use @@ -353,8 +357,8 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { for (size_t i = 0; i < this->partitionedGraph->size(); i++) { if (masks[i] == 1) { std::copy(DistContext::h_feats + i * DistContext::feat_len, - DistContext::h_feats + (i + 1) * DistContext::feat_len, - &DistContext::h_feats_subg[count * DistContext::feat_len]); + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); // for (unsigned a = 0; a < DistContext::feat_len; a++) { // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 3bc7762fd5..da2a7356ea 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -29,32 +29,26 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, 0); std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); + // load the training/val masks if (dataset_str == "reddit") { - // this->globalTrainBegin = 0; - // this->globalTrainCount = 153431; - // this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - // this->globalValBegin = 153431; - // this->globalValCount = 23831; - // this->globalValEnd = this->globalValBegin + this->globalValCount; - // find local ID from global ID, set if it exists - for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) { + for (size_t i = this->globalTrainBegin; i < this->globalTrainEnd; i++) { if (this->dGraph->isLocal(i)) { this->distTrainMasks[this->dGraph->getLID(i)] = 1; } } - for (size_t i = globalValBegin; i < globalValEnd; i++) { + for (size_t i = this->globalValBegin; i < this->globalValEnd; i++) { if (this->dGraph->isLocal(i)) { this->distValMasks[this->dGraph->getLID(i)] = 1; } } } else { globalTrainCount = 
this->distContext->read_masks( - dataset_str, "train", this->distNumSamples, globalTrainBegin, - globalTrainEnd, this->distTrainMasks, this->dGraph); + dataset_str, "train", this->distNumSamples, this->globalTrainBegin, + this->globalTrainEnd, this->distTrainMasks, this->dGraph); globalValCount = this->distContext->read_masks( - dataset_str, "val", this->distNumSamples, globalValBegin, globalValEnd, - this->distValMasks, this->dGraph); + dataset_str, "val", this->distNumSamples, this->globalValBegin, + this->globalValEnd, this->distValMasks, this->dGraph); } // input feature dimension: D @@ -96,8 +90,9 @@ void Net::read_test_masks(std::string dataset) { test_masks[dGraph->getLID(i)] = 1; } } else { - globalTestCount = distContext->read_masks(dataset, std::string("test"), - globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); + globalTestCount = distContext->read_masks( + dataset, std::string("test"), globalSamples, globalTestBegin, + globalTestEnd, test_masks, dGraph); } } diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 6e6e00a5d1..54987d4635 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -16,7 +16,9 @@ namespace deepgalois { // be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if // required. size_t Reader::read_labels(bool is_single_class, label_t*& labels) { - std::cout << "Reading labels ... "; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reader: Reading labels...\n"); + Timer t_read; t_read.Start(); std::string filename = path + dataset_str + "-labels.txt"; @@ -26,11 +28,12 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { size_t m, num_classes; // m: number of samples in >> m >> num_classes >> std::ws; if (is_single_class) { - std::cout << "Using single-class (one-hot) labels\n"; + galois::gPrint("[", myID, + "] Reader: Using single-class (one-hot) labels\n"); labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { - std::cout << "Using multi-class labels\n"; + galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); labels = new label_t[m * num_classes]; // multi-class label for each vertex: N x E @@ -55,8 +58,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in.close(); t_read.Stop(); // print the number of vertex classes - std::cout << "Done, unique label counts: " << num_classes - << ", time: " << t_read.Millisecs() << " ms\n"; + galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + ", time: ", t_read.Millisecs(), " ms\n"); // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -147,9 +150,9 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count / (float)n * (float)100 << "\%)\n"; + galois::gPrint("Global read", mask_type, "_mask range: [", begin, ", ", end, + ") Number of valid samples: ", sample_count, " (", + (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); return sample_count; } @@ -209,7 +212,6 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); } - printf("num_vertices %lu num_edges %lu\n", nv, ne); g->allocateFrom(nv, ne); auto rowptr = 
g->row_start_host_ptr(); for (unsigned vid = 0; vid < nv; ++vid) { @@ -250,9 +252,9 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { ifs.close(); */ t.Stop(); - double runtime = t.Millisecs(); - std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" - << masterLength / 1000.0 / runtime << " MB/s)\n\n"; + // double runtime = t.Millisecs(); + // std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + // << masterLength / 1000.0 / runtime << " MB/s)\n\n"; } /* diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 3f67974c67..2780c692be 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -25,6 +25,7 @@ OutTy* parallel_prefix_sum(const std::vector& in) { total += local_sums[block]; } bulk_prefix[num_blocks] = total; + // TODO do not use new here: difficult to track and free later OutTy* prefix = new OutTy[in.size() + 1]; galois::do_all( galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { @@ -109,8 +110,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, recall_mic + precision_mic > 0. ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) : 0.; - std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro - << ", f1_macro: " << f1_macro << ") "; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); return f1_micro; } diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index c33e7d5574..454179ad5d 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -6,5 +6,6 @@ const char* name = "Graph Convolutional Networks"; const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; +// TODO rather than having main being part of include file, have main in this +// just be a function call to some common start function #include "engine.h" - diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index ad63ffdb78..155c65ca68 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -12,8 +12,8 @@ #include "deepgalois/Net.h" static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { - out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() - << " (" << galois::getRevision() << ")\n"; + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; out.flush(); } @@ -32,25 +32,25 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, auto& net = galois::runtime::getSystemNetworkInterface(); if (net.ID == 0) { #endif - LonestarGnnPrintVersion(llvm::outs()); - std::cout << "Copyright (C) " << galois::getCopyrightYear() - << " The University of Texas at Austin\n"; - std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; - std::cout << "application: " << (app ? 
app : "unspecified") << "\n"; - if (desc) - std::cout << desc << "\n"; - if (url) - std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" - << url << "\n"; - std::cout << "\n"; - std::ostringstream cmdout; - for (int i = 0; i < argc; ++i) { - cmdout << argv[i]; - if (i != argc - 1) - cmdout << " "; - } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); - galois::runtime::reportParam("(NULL)", "Threads", numThreads); + LonestarGnnPrintVersion(llvm::outs()); + std::cout << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? app : "unspecified") << "\n"; + if (desc) + std::cout << desc << "\n"; + if (url) + std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" + << url << "\n"; + std::cout << "\n"; + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); #ifdef GALOIS_USE_DIST } #endif @@ -76,7 +76,7 @@ int main(int argc, char** argv) { #endif // initialize network + whole context on CPU - // read network, features, ground truth, initialize metadata + // read network, initialize metadata // default setting for now; can be customized by the user deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, From eb95b9ef254786850ff87faefbb6be87116fdc5c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 17:02:24 -0500 Subject: [PATCH 300/660] output layers now dist part aware --- .../src/layers/sigmoid_loss_layer.cpp | 98 ++++++++++++------- .../src/layers/softmax_loss_layer.cpp | 84 +++++++++------- 2 files changed, 112 insertions(+), 70 deletions(-) diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 3dcb312f08..8d72ed4b07 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -25,22 +25,32 @@ inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - size_t len = input_dims[1]; + size_t featLen = input_dims[1]; galois::do_all( galois::iterate(begin_, end_), - [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - size_t idx = len * i; - // output is normalized input for this layer - math::sigmoid(len, &in_data[idx], - &out_data[idx]); // normalize using sigmoid - // one hot encoded vector for the labels - float_t* ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) - ground_truth[j] = (float_t)get_label(i, j); - // loss calculation - loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); - delete[] ground_truth; + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + // check if local to this host + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + size_t idx = featLen * lid; + + // output is normalized input for this layer + math::sigmoid(featLen, &in_data[idx], + &out_data[idx]); // normalize using sigmoid + + // one hot encoded vector for the labels + // TODO this is a bottleneck; big lock on memory allocator + float_t* ground_truth = new float_t[featLen]; + for (size_t j = 0; j < featLen; j++) + 
ground_truth[j] = (float_t)get_label(lid, j);
+          // loss calculation
+          this->loss[lid] =
+              math::cross_entropy(featLen, ground_truth, &out_data[idx]);
+
+          // TODO this is a bottleneck, lock on memory possibly
+          delete[] ground_truth;
+        }
       }
     },
     galois::chunk_size(), galois::steal(),
@@ -50,23 +60,31 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data,
 void sigmoid_loss_layer::back_propagation(const float_t* in_data,
                                           const float_t* out_data, float_t*,
                                           float_t* in_grad) {
-  size_t len = layer::input_dims[1];
+  size_t featLen = layer::input_dims[1];
+
   galois::do_all(
       galois::iterate(layer::begin_, layer::end_),
-      [&](const auto& i) {
-        if (!use_mask || masks_[i] == 1) { // masked
-          size_t idx = len * i;
-          float_t* norm_grad = new float_t[len];
-          float_t* ground_truth = new float_t[len];
-          for (size_t j = 0; j < len; j++)
-            ground_truth[j] = (float_t)get_label(i, j);
-          // use ground truth to determine derivative of cross entropy
-          math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad);
-          // derviative sigmoid to gradient used in the next layer
-          math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx],
-                          norm_grad);
-          delete[] norm_grad;
-          delete[] ground_truth;
+      [&](const auto& gid) {
+        if (!use_mask || masks_[gid] == 1) { // masked
+          if (this->context->isLocal(gid)) {
+            unsigned lid = this->context->getLID(gid);
+
+            size_t idx = featLen * lid;
+            // TODO this is bad
+            float_t* norm_grad = new float_t[featLen];
+            float_t* ground_truth = new float_t[featLen];
+            for (size_t j = 0; j < featLen; j++)
+              ground_truth[j] = (float_t)get_label(lid, j);
+            // use ground truth to determine derivative of cross entropy
+            math::d_cross_entropy(featLen, ground_truth, &out_data[idx],
+                                  norm_grad);
+            // derivative sigmoid to gradient used in the next layer
+            math::d_sigmoid(featLen, &in_data[idx], &out_data[idx],
+                            &in_grad[idx], norm_grad);
+            // TODO this is bad
+            delete[] norm_grad;
+            delete[] ground_truth;
+          }
         }
       },
     galois::chunk_size(), galois::steal(),
@@ -74,23 +92,31 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data,
 }
 acc_t sigmoid_loss_layer::get_prediction_loss() {
-  assert(count_ > 0);
   galois::GAccumulator total_loss;
   galois::GAccumulator valid_sample_count;
   total_loss.reset();
   valid_sample_count.reset();
+
   galois::do_all(
       galois::iterate(layer::begin_, layer::end_),
-      [&](const auto& i) {
-        if (!use_mask || masks_[i]) {
-          total_loss += loss[i];
-          valid_sample_count += 1;
+      [&](const auto& gid) {
+        if (!use_mask || masks_[gid]) {
+          if (this->context->isLocal(gid)) {
+            unsigned lid = this->context->getLID(gid);
+            total_loss += this->loss[lid];
+            valid_sample_count += 1;
+          }
         }
       },
       galois::chunk_size<256>(), galois::steal(),
       galois::loopname("getMaskedLoss"));
-  assert(valid_sample_count.reduce() == count_);
-  return total_loss.reduce() / (acc_t)count_;
+
+  size_t c = valid_sample_count.reduce();
+  if (c > 0) {
+    return total_loss.reduce() / (acc_t)valid_sample_count.reduce();
+  } else {
+    return 0;
+  }
 }
 } // namespace deepgalois
diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp
index 940fbeb798..3581365427 100644
--- a/libdeepgalois/src/layers/softmax_loss_layer.cpp
+++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp
@@ -27,20 +27,26 @@ inline label_t softmax_loss_layer::get_label(size_t i) {
 // 𝑦[i] = 𝑒^𝑥[i] / Σ 𝑒^𝑥[𝑘]
 void softmax_loss_layer::forward_propagation(const float_t* in_data,
                                              float_t* out_data) {
-  size_t len = input_dims[1];
+  // size_t numSamples =
input_dims; + size_t featLen = input_dims[1]; galois::do_all( galois::iterate(begin_, end_), - [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - // output is normalized input for this layer - math::softmax(len, &in_data[len * i], - &out_data[len * i]); // normalize using softmax - // one hot encoded vector for the labels - vec_t groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[get_label(i)] = 1.0; // one-hot - // loss calculation - loss[i] = - math::cross_entropy(len, &groundTruth[0], &out_data[len * i]); + [&](const unsigned gid) { + // if no mask used it means all are fair game + if (!use_mask || masks_[gid] == 1) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + // output is normalized input for this layer + math::softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid]); // normalize using softmax + // one hot encoded vector for the labels + vec_t groundTruth(output_dims[1], 0.0); // ground truth + // labels are local + groundTruth[get_label(lid)] = 1.0; // one-hot + // loss calculation + loss[lid] = math::cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid]); + } } }, galois::chunk_size<64>(), galois::steal(), @@ -54,20 +60,24 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t*, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) - size_t len = layer::input_dims[1]; + size_t featLen = layer::input_dims[1]; galois::do_all( galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[get_label(i)] = 1.0; - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], - &norm_grad[0]); - // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + vec_t norm_grad(featLen); + std::vector groundTruth(featLen, 0.0); + groundTruth[get_label(lid)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid], &in_grad[featLen * lid], + &norm_grad[0]); + } } }, galois::chunk_size<64>(), galois::steal(), @@ -77,25 +87,31 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, } acc_t softmax_loss_layer::get_prediction_loss() { - assert(count_ > 0); galois::GAccumulator total_loss; galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); + galois::do_all( galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (!use_mask || masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; + [&](const auto& gid) { + if (!use_mask || masks_[gid]) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + total_loss += this->loss[lid]; + valid_sample_count += 1; + } } }, - galois::chunk_size<64>(), galois::steal(), + galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - // std::cout << "begin = " << begin_ << " end = " 
<< end_ << " count = " << - // count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; + + size_t c = valid_sample_count.reduce(); + if (c > 0) { + return total_loss.reduce() / (acc_t)valid_sample_count.reduce(); + } else { + return 0; + } } } // namespace deepgalois From 7bab2e7558549c226fe8e7fb2306c68bf37b75dd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 17:03:38 -0500 Subject: [PATCH 301/660] distcontext: added interface to query node presence in dist graph --- .../include/deepgalois/DistContext.h | 19 +++++++------ libdeepgalois/src/DistContext.cpp | 27 ++++++++++++++++++- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 08e101e898..49222eb3ab 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -110,19 +110,22 @@ class DistContext { //! return label for some node //! NOTE: this is LID, not GID - label_t get_label(size_t i) { return h_labels[i]; } + label_t get_label(size_t lid) { return h_labels[lid]; } //! returns pointer to the features of each local node float_t* get_in_ptr(); //! allocate memory for subgraphs (don't actually build them) - void allocateSubgraphs(int num_subgraphs, unsigned max_size) { - partitionedSubgraphs.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) { - partitionedSubgraphs[i] = new Graph(); - partitionedSubgraphs[i]->set_max_size(max_size); - } - } + void allocateSubgraphs(int num_subgraphs, unsigned max_size); + + //! return if a vertex is owned by the partitioned graph this context contains + bool isOwned(unsigned gid); + //! return if part graph has provided vertex for given gid locally + bool isLocal(unsigned gid); + //! get GID of an lid for a vertex + unsigned getGID(unsigned lid); + //! get local id of a vertex given a global id for that vertex + unsigned getLID(unsigned gid); }; } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index b9caa7ef5a..320cc75b7f 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -148,7 +148,7 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } -// TODO move to reader class +// TODO move to reader class/reuse reader class somehow size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { @@ -376,4 +376,29 @@ galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; } +//! 
allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + +bool DistContext::isOwned(unsigned gid) { + return this->partitionedGraph->isOwned(gid); +} + +bool DistContext::isLocal(unsigned gid) { + return this->partitionedGraph->isLocal(gid); +} + +unsigned DistContext::getGID(unsigned lid) { + return this->partitionedGraph->getGID(lid); +} + +unsigned DistContext::getLID(unsigned gid) { + return this->partitionedGraph->getLID(gid); +} + } // namespace deepgalois From 5a251d4906dcf444f0b6dff098643d7f43fbac56 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 17:11:06 -0500 Subject: [PATCH 302/660] test masks always read global: fixed var names to be much more readable --- libdeepgalois/include/deepgalois/Net.h | 108 +++++++++++++------------ libdeepgalois/src/Net.cpp | 86 ++++++++++---------- libdeepgalois/src/reader.cpp | 2 +- libdeepgalois/src/utils.cpp | 9 ++- 4 files changed, 111 insertions(+), 94 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 3971da74d2..3bc9f8684c 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -63,17 +63,19 @@ class Net { mask_t* globalTrainMasks; // masks for training mask_t* globalValMasks; // masks for validation + mask_t* globalTestMasks; // masks for test + // TODO it's looking like we may not even need these dist versions mask_t* distTrainMasks; mask_t* distValMasks; - mask_t* test_masks; // masks for test + mask_t* distTestMasks; // masks for test, dst mask_t* d_train_masks; // masks for training on device mask_t* d_val_masks; // masks for validation on device mask_t* d_test_masks; // masks for test on device mask_t* subgraphs_masks; // masks for subgraphs; size of local graph - mask_t* - d_subgraphs_masks; // masks for subgraphs on device; size of local graph + // masks for subgraphs on device; size of local graph + mask_t* d_subgraphs_masks; std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -107,12 +109,11 @@ class Net { assert(n_conv > 0); - // TODO use galois print: need avoid including Galois.h for GPU - std::cout << header << "Configuration: num_threads " << num_threads - << ", num_conv_layers " << num_conv_layers << ", num_epochs " - << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " - << learning_rate << ", dropout_rate " << dropout_rate - << ", weight_decay " << weight_decay << "\n"; + galois::gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); this->num_layers = num_conv_layers + 1; // additional layers to add @@ -133,6 +134,7 @@ class Net { // subgraph in the sampler globalTrainMasks = new mask_t[globalSamples]; globalValMasks = new mask_t[globalSamples]; + globalTestMasks = new mask_t[globalSamples]; std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); std::fill(globalValMasks, globalValMasks + globalSamples, 0); @@ -183,7 +185,7 @@ class Net { // globalValCount(0), globalTestBegin(0), globalTestEnd(0), // globalTestCount(0), 
val_interval(1), num_subgraphs(1), // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), - // test_masks(NULL), context(NULL) {} + // globalTestMasks(NULL), context(NULL) {} void allocateSubgraphsMasks(int num_subgraphs); @@ -351,32 +353,32 @@ class Net { double evaluate(std::string type, acc_t& loss, acc_t& acc) { Timer t_eval; t_eval.Start(); - size_t begin = 0, end = 0, count = 0; - mask_t* masks = NULL; + size_t gBegin = 0, gEnd = 0, gCount = 0; + mask_t* gMasks = NULL; // TODO global here good for dist case? if (type == "train") { - begin = globalTrainBegin; - end = globalTrainEnd; - count = globalTrainCount; - masks = globalTrainMasks; + gBegin = globalTrainBegin; + gEnd = globalTrainEnd; + gCount = globalTrainCount; + gMasks = globalTrainMasks; if (subgraph_sample_size) { - // update masks for subgraph - masks = NULL; - begin = 0; - end = this->subgraphNumVertices; - count = this->subgraphNumVertices; + // update gMasks for subgraph + gMasks = NULL; + gBegin = 0; + gEnd = this->subgraphNumVertices; + gCount = this->subgraphNumVertices; } } else if (type == "val") { - begin = globalValBegin; - end = globalValEnd; - count = globalValCount; - masks = globalValMasks; + gBegin = globalValBegin; + gEnd = globalValEnd; + gCount = globalValCount; + gMasks = globalValMasks; } else { - begin = globalTestBegin; - end = globalTestEnd; - count = globalTestCount; - masks = test_masks; + gBegin = globalTestBegin; + gEnd = globalTestEnd; + gCount = globalTestCount; + gMasks = globalTestMasks; } // switch to the original graph if not training @@ -392,41 +394,46 @@ class Net { } #ifdef __GALOIS_HET_CUDA__ if (type == "train") { - masks = d_train_masks; + gMasks = d_train_masks; } else if (type == "val") { - masks = d_val_masks; + gMasks = d_val_masks; } else { - masks = d_test_masks; + gMasks = d_test_masks; } #endif galois::gPrint(header, "Doing actual forward propagation\n"); - loss = fprop(begin, end, count, masks); - galois::gPrint(header, "Forward propagation donne, going to check accuracy\n"); + loss = fprop(gBegin, gEnd, gCount, gMasks); + galois::gPrint(header, + "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); // labels will be subgraph labels if applicable - label_t* labels; + label_t* localLabels; if (type == "train" && subgraph_sample_size) { - labels = distContext->get_labels_subg_ptr(); + localLabels = distContext->get_labels_subg_ptr(); } else { // note this grabs global labels; everything passed in should be global - labels = distContext->get_labels_ptr(); + localLabels = distContext->get_labels_ptr(); } if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, predictions, labels); + acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, - labels); + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, + predictions, localLabels); } t_eval.Stop(); return t_eval.Millisecs(); } - // read masks of test set + //! read masks of test set for GLOBAL set void read_test_masks(std::string dataset); + //! read test masks only for local nodes; assumes dist context is initialized + void readDistributedTestMasks(std::string dataset); + // void copy_test_masks_to_device(); void construct_layers() { @@ -533,12 +540,12 @@ class Net { //! forward propagation: [begin, end) is the range of samples used. //! 
calls "forward" on each layer and returns the loss of the final layer - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { // set mask for the last layer; globals - // TODO this should be distirbuted sample begin->end not global; fix later + // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways galois::gPrint(header, "fprop: set sample mask\n"); - layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); + layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { galois::gPrint(header, "fprop: layer ", i, " forward call\n"); @@ -547,7 +554,7 @@ class Net { galois::gPrint(header, "fprop: getting loss\n"); // prediction error - auto loss = layers[num_layers - 1]->get_prediction_loss(); + acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting loss += weight_decay * layers[0]->get_weight_decay_loss(); return loss; @@ -576,11 +583,12 @@ class Net { } // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, - float_t* preds, label_t* ground_truth); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, - label_t* ground_truth); + acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); + acc_t masked_multi_class_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); }; } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index da2a7356ea..55de8ad3ae 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -80,95 +80,99 @@ void Net::regularize() { } void Net::read_test_masks(std::string dataset) { - test_masks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + globalTestMasks[i] = 1; + } + } else { + globalTestCount = graphTopologyContext->read_masks( + "test", globalSamples, globalTestBegin, globalTestEnd, globalTestMasks); + } +} + +void Net::readDistributedTestMasks(std::string dataset) { + distTestMasks = new mask_t[distNumSamples]; if (dataset == "reddit") { globalTestBegin = 177262; globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; for (size_t i = globalTestBegin; i < globalTestEnd; i++) { if (dGraph->isLocal(i)) - test_masks[dGraph->getLID(i)] = 1; + distTestMasks[dGraph->getLID(i)] = 1; } } else { globalTestCount = distContext->read_masks( dataset, std::string("test"), globalSamples, globalTestBegin, - globalTestEnd, test_masks, dGraph); + globalTestEnd, distTestMasks, dGraph); } } /** - * - * @param begin GLOBAL begin - * @param end GLOBAL end - * @param masks: GLOBAL masks - * @param count GLOBAL training count + * @param gBegin GLOBAL begin + * @param gEnd GLOBAL end + * @param gMasks: GLOBAL masks + * @param gCount GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, - label_t* ground_truth) { +acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* 
localGroundTruth) { galois::DGAccumulator accuracy_all; galois::DGAccumulator sampleCount; accuracy_all.reset(); sampleCount.reset(); - // TODO figure this out for distributed case galois::do_all( - galois::iterate(begin, end), + galois::iterate(gBegin, gEnd), [&](const auto& i) { -#ifndef GALOIS_USE_DIST - if (masks == NULL || - masks[i] == 1) { // use sampled graph when masks is NULL - // get prediction - auto pred = math::argmax(num_classes, preds + i * num_classes); - // check prediction - if ((label_t)pred == ground_truth[i]) - accuracy_all += 1.0; - } -#else - // TODO dist subraph - // only look at owned nodes (i.e. masters); the prediction for these // should only by handled on the owner if (this->dGraph->isOwned(i)) { sampleCount += 1; uint32_t localID = this->dGraph->getLID(i); - if (masks == NULL) { - // GALOIS_DIE("subgraphs not implemented for dist yet"); - // subgraph here: TODO + if (gMasks == NULL) { auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == ground_truth[localID]) + if ((label_t)pred == localGroundTruth[localID]) accuracy_all += 1.0; } else { - if (masks[localID] == 1) { + // TODO masks needs to be local id + if (gMasks[localID] == 1) { // get prediction auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == ground_truth[localID]) + if ((label_t)pred == localGroundTruth[localID]) accuracy_all += 1.0; } } } -#endif }, galois::loopname("getMaskedLoss")); - count = sampleCount.reduce(); - galois::gDebug("sample count is ", count); + gCount = sampleCount.reduce(); + galois::gDebug("sample count is ", gCount); // all hosts should get same accuracy - return accuracy_all.reduce() / (acc_t)count; + return accuracy_all.reduce() / (acc_t)gCount; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, - label_t* ground_truth) { - // TODO dist version - return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, - ground_truth, preds); +acc_t Net::masked_multi_class_accuracy(size_t gBegin, size_t gEnd, + size_t gCount, mask_t* gMasks, + float_t* preds, + label_t* localGroundTruth) { + // TODO fix this + if (galois::runtime::getSystemNetworkInterface().Num > 1) { + GALOIS_DIE( + "Multi-class accuracy not yet implemented for distributed setting\n"); + } + + return deepgalois::masked_f1_score(gBegin, gEnd, gCount, gMasks, num_classes, + localGroundTruth, preds); } } // namespace deepgalois diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 54987d4635..d131913587 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -150,7 +150,7 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - galois::gPrint("Global read", mask_type, "_mask range: [", begin, ", ", end, + galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, ") Number of valid samples: ", sample_count, " (", (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 2780c692be..9fb90c46c1 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -60,12 +60,14 @@ parallel_prefix_sum(const std::vector& in); acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, size_t num_classes, label_t* ground_truth, float_t* pred) { + // TODO dist version; make aware of distributed execution double precision_cls(0.), 
recall_cls(0.), f1_accum(0.); int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); + for (size_t col = 0; col < num_classes; col++) { int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); + for (size_t row = begin; row < end; row++) { - // galois::do_all(galois::iterate(begin, end), [&](const auto& row) { if (masks == NULL || masks[row] == 1) { auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { @@ -83,7 +85,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, } } } - //}, galois::loopname("MaskedF1Score")); + tp_accum += tp_cls; fn_accum += fn_cls; fp_accum += fp_cls; @@ -97,6 +99,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, ? 2. * (recall_cls * precision_cls) / (recall_cls + precision_cls) : 0.; } + double f1_macro = f1_accum / (double)num_classes; // double accuracy_mic = // (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); @@ -110,9 +113,11 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, recall_mic + precision_mic > 0. ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) : 0.; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); + return f1_micro; } From f68125b25dd5077925c26531d0e3c261aa2f1820 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 18:16:32 -0500 Subject: [PATCH 303/660] reactivated sync; still buggy apparenltly --- libdeepgalois/include/deepgalois/Net.h | 2 +- .../deepgalois/layers/GradientSyncStructs.h | 1 + libdeepgalois/src/Net.cpp | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 20 +++++++++---------- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 3bc9f8684c..522365b662 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -413,7 +413,7 @@ class Net { if (type == "train" && subgraph_sample_size) { localLabels = distContext->get_labels_subg_ptr(); } else { - // note this grabs global labels; everything passed in should be global + // note this grabs local labels localLabels = distContext->get_labels_ptr(); } diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index dd2f3de6a9..0f73f2cbca 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -15,6 +15,7 @@ struct GradientSync { // galois::gInfo("weight ", node_id, " not consistent with one received"); //} weight += y; + weight /= 2; return true; } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 55de8ad3ae..605cd209e1 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -155,7 +155,7 @@ acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, galois::loopname("getMaskedLoss")); gCount = sampleCount.reduce(); - galois::gDebug("sample count is ", gCount); + galois::gDebug("Total sample count is ", gCount); // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)gCount; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d7c29d1cfa..941a796a81 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -118,10 
+118,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO sync of out_data required here // TODO how to do this for the sampled case? - // deepgalois::_syncVectorSize = z; - // deepgalois::_dataToSync = out_data; - // layer::context->getSyncSubstrate()->sync( - // "AggSync"); + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_data; + layer::context->getSyncSubstrate()->sync( + "AggSync"); // run relu activation on output if specified if (act_) @@ -164,16 +164,16 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } // sync agg - // deepgalois::_syncVectorSize = z; - // deepgalois::_dataToSync = out_temp; - // layer::context->getSyncSubstrate()->sync( - // "AggSyncBack"); + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_temp; + layer::context->getSyncSubstrate()->sync( + "AggSyncBack"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); - // layer::syncSub->sync("GradientSync"); - // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + layer::syncSub->sync("GradientSync"); + galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); } acc_t graph_conv_layer::get_weight_decay_loss() { From e41fa34451bf8aa429f339ef768060c2db745910 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 06:58:20 -0500 Subject: [PATCH 304/660] update sampler --- libdeepgalois/include/deepgalois/Sampler.h | 7 ++++++- libdeepgalois/src/Sampler.cpp | 6 ------ libdeepgalois/src/Sampler.cu | 14 +++++++++----- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index b8f19dcca7..1b5754f394 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -43,13 +43,18 @@ class Sampler { void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); //! Set masks bitset with IDs in the vertices VertexSet - void createMasks(size_t n, VertexSet vertices, mask_t* masks); + //void createMasks(size_t n, VertexSet vertices, mask_t* masks); //inline VertexList reindexVertices(size_t n, VertexSet vertex_set); //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); + void createMasks(size_t n, VertexSet vertices, mask_t* masks) { + std::fill(masks, masks + n, 0); + for (auto v : vertices) masks[v] = 1; + } + //! 
helper function to get degree of some vertex given some graph inline unsigned getDegree(GraphCPU* g, index_t v) { return g->edge_end_host(v) - g->edge_begin_host(v); diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 1feb2ecb69..36b697ecb6 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -215,12 +215,6 @@ void Sampler::selectVertices(index_t nv, index_t n, Graph* g, */ } -void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { - // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); - std::fill(masks, masks + n, 0); - for (auto v : vertices) masks[v] = 1; -} - // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index c5db16c5f1..97835ea9cc 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -124,13 +124,17 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); - clear_masks<<>>(n, masks); // set all 0 - CudaTest("solving clear_masks kernel failed"); + createMasks(n, vertex_set, masks); + mask_t* d_masks; + cudaMalloc((void**)&d_masks, n * sizeof(mask_t)); + CUDA_CHECK(cudaMemcpy(d_masks, masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + //clear_masks<<>>(n, d_masks); // set all 0 + //CudaTest("solving clear_masks kernel failed"); // createMasks: set masks for vertices in the vertex_set - set_masks<<>>(n, d_vertex_list, masks); - CudaTest("solving set_masks kernel failed"); + //set_masks<<>>(n, d_vertex_list, d_masks); + //CudaTest("solving set_masks kernel failed"); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - getMaskedGraph(n, masks, partGraph, &masked_sg); // remove edges whose destination is not masked + getMaskedGraph(n, d_masks, partGraph, &masked_sg); // remove edges whose destination is not masked std::cout << "maskedGraph generated\n"; // re-index the subgraph From 6a72b42153f63aabdad44fc59de83cd8444a76ad Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:00:40 -0500 Subject: [PATCH 305/660] fix some gpu compile errors --- .../include/deepgalois/layers/layer.h | 6 +++--- libdeepgalois/src/reader.cpp | 20 ++++++++++--------- libdeepgalois/src/utils.cpp | 9 ++++++--- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 99ec74fb4a..ee2d66aa95 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -41,9 +41,9 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - const std::string header = - "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + - "] "; + //const std::string header = + // "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + // "] "; unsigned level_; // layer id: [0, num_layers-1] size_t begin_; // sample begin index size_t end_; // sample end index diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index d131913587..6c408f6449 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -16,8 +16,11 @@ 
namespace deepgalois { // be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if // required. size_t Reader::read_labels(bool is_single_class, label_t*& labels) { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = 0; +#ifndef __GALOIS_HET_CUDA__ + myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reader: Reading labels...\n"); +#endif Timer t_read; t_read.Start(); @@ -28,12 +31,11 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { size_t m, num_classes; // m: number of samples in >> m >> num_classes >> std::ws; if (is_single_class) { - galois::gPrint("[", myID, - "] Reader: Using single-class (one-hot) labels\n"); + std::cout << "[" << myID << "] Reader: Using single-class (one-hot) labels\n"; labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { - galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); + std::cout << "[" << myID << "] Reader: Using multi-class (one-hot) labels\n"; labels = new label_t[m * num_classes]; // multi-class label for each vertex: N x E @@ -58,8 +60,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in.close(); t_read.Stop(); // print the number of vertex classes - galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, - ", time: ", t_read.Millisecs(), " ms\n"); + std::cout << "[" << myID << "] Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -150,9 +152,9 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, - ") Number of valid samples: ", sample_count, " (", - (float)sample_count / (float)n * (float)100, "\%)\n"); + std::cout << "Global read " << mask_type << "_mask range: [" << begin + << ", " << end << ") Number of valid samples: " << sample_count + << " (" << (float)sample_count / (float)n * (float)100 << "\%)\n"; in.close(); return sample_count; } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 9fb90c46c1..2f4e6ba549 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -114,9 +114,12 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, ? 2. 
* (recall_mic * precision_mic) / (recall_mic + precision_mic) : 0.; - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, - " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); + unsigned myID = 0; +#ifndef __GALOIS_HET_CUDA__ + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed + << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; return f1_micro; } From 08cb177d133a7266f91392e8ee3ea3b94008a8ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:10:42 -0500 Subject: [PATCH 306/660] tiny --- libdeepgalois/include/deepgalois/layers/layer.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index ee2d66aa95..7715836404 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -86,10 +86,13 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), - "input[", input_dims[0], ",", input_dims[1], "] output[", - output_dims[0], ",", output_dims[1], "]\n"); + unsigned myID = 0; +#ifndef __GALOIS_HET_CUDA__ + galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "] Layer" << level_ << " type: " << layer_type() + << "input[" << input_dims[0], "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } From 8bd3e33fa5e0b60048faa9d7315b67135729069a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:18:47 -0500 Subject: [PATCH 307/660] fix error --- libdeepgalois/include/deepgalois/layers/layer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7715836404..b7779b7e5b 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -88,10 +88,10 @@ class layer : public deepgalois::node { void print_layer_info() { //! 
debug print function unsigned myID = 0; #ifndef __GALOIS_HET_CUDA__ - galois::runtime::getSystemNetworkInterface().ID; + myID = galois::runtime::getSystemNetworkInterface().ID; #endif std::cout << "[" << myID << "] Layer" << level_ << " type: " << layer_type() - << "input[" << input_dims[0], "," << input_dims[1] << "] output[" + << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; } // get methods From 5e7f0b56bb744c78b425c8a4823ce4faf2fafa08 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:58:39 -0500 Subject: [PATCH 308/660] fix test_masks name --- libdeepgalois/src/Net.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 7b76f217dd..8e8ce83267 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -183,19 +183,18 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleCla } void Net::read_test_masks(std::string dataset) { - test_masks = new mask_t[distNumSamples]; if (dataset == "reddit") { globalTestBegin = 177262; globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; for (size_t i = globalTestBegin; i < globalTestEnd; i++) - test_masks[i] = 1; + globalTestMasks[i] = 1; } else { globalTestCount = distContext->read_masks(dataset, std::string("test"), - globalSamples, globalTestBegin, globalTestEnd, test_masks, NULL); + globalSamples, globalTestBegin, globalTestEnd, globalTestMasks, NULL); } //copy_test_masks_to_device(); - copy_masks_device(globalSamples, test_masks, d_test_masks); + copy_masks_device(globalSamples, globalTestMasks, d_test_masks); } //void Net::copy_test_masks_to_device() {} From 925d1a5a54cb1c632cd085d9514c364d7ab4c9ba Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 16 May 2020 21:14:44 -0500 Subject: [PATCH 309/660] Fixed accuracy checking in the distributed setting --- libdeepgalois/src/Net.cpp | 17 ++++++++--------- lonestar/gnn/include/engine.h | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 605cd209e1..d07b19f912 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -126,28 +126,28 @@ acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, galois::do_all( galois::iterate(gBegin, gEnd), - [&](const auto& i) { + [&](const auto& gid) { // only look at owned nodes (i.e. 
masters); the prediction for these // should only by handled on the owner - if (this->dGraph->isOwned(i)) { + if (this->dGraph->isOwned(gid)) { sampleCount += 1; - - uint32_t localID = this->dGraph->getLID(i); + uint32_t localID = this->dGraph->getLID(gid); if (gMasks == NULL) { auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == localGroundTruth[localID]) + if ((label_t)pred == localGroundTruth[localID]) { accuracy_all += 1.0; + } } else { - // TODO masks needs to be local id - if (gMasks[localID] == 1) { + if (gMasks[gid] == 1) { // get prediction auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == localGroundTruth[localID]) + if ((label_t)pred == localGroundTruth[localID]) { accuracy_all += 1.0; + } } } } @@ -156,7 +156,6 @@ acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, gCount = sampleCount.reduce(); galois::gDebug("Total sample count is ", gCount); - // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)gCount; } diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 155c65ca68..36a04b2f70 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -103,7 +103,6 @@ int main(int argc, char** argv) { if (do_test) { // test using test samples - galois::gPrint("\n"); network.read_test_masks(dataset); galois::StatTimer Ttest("Test"); Ttest.start(); From bc1f798bef0f97c63c1527165e01629512ab5761 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 16 May 2020 22:11:21 -0500 Subject: [PATCH 310/660] RIP gPrint In all seriosness, GPU not compiling if we include Galois in Net and such is a serious issue for distributed execution; will need to be fixed later --- libdeepgalois/include/deepgalois/Net.h | 68 +++++++++++-------- .../deepgalois/layers/GradientSyncStructs.h | 2 + .../layers/GraphConvSyncStructures.h | 2 + .../include/deepgalois/layers/layer.h | 16 +++-- libdeepgalois/include/deepgalois/reader.h | 2 +- libdeepgalois/include/deepgalois/utils.h | 5 +- libdeepgalois/src/reader.cpp | 9 +++ libdeepgalois/src/utils.cpp | 2 + lonestar/gnn/include/engine.h | 7 +- 9 files changed, 66 insertions(+), 47 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 522365b662..d3558a99e3 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -109,11 +109,11 @@ class Net { assert(n_conv > 0); - galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + //galois::gPrint(header, "Configuration: num_threads ", num_threads, + // ", num_conv_layers ", num_conv_layers, ", num_epochs ", + // num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + // learning_rate, ", dropout_rate ", dropout_rate, + // ", weight_decay ", weight_decay, "\n"); this->num_layers = num_conv_layers + 1; // additional layers to add @@ -201,7 +201,10 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { +// TOOD this needs to be enabled +#ifndef __GALOIS_HET_CUDA__ distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); +#endif allocateSubgraphsMasks(num_subgraphs); std::cout << header << "Constructing training vertex set induced graph...\n"; @@ -212,7 +215,7 @@ class Net { 
distContext->getGraphPointer()); } - galois::gPrint(header, "Start training...\n"); + //galois::gPrint(header, "Start training...\n"); Timer t_epoch; @@ -296,22 +299,24 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); + //galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; - galois::gPrint(header, "Calling into eval for forward propagation\n"); + //galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop - double fw_time = evaluate("train", train_loss, train_acc); + //double fw_time = evaluate("train", train_loss, train_acc); + evaluate("train", train_loss, train_acc); + - galois::gPrint(header, "Calling into backward propagation\n"); + //galois::gPrint(header, "Calling into backward propagation\n"); // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients Net::bprop(); - galois::gPrint(header, "Weight update call\n"); + //galois::gPrint(header, "Weight update call\n"); // gradient update: use gradients stored on each layer to update model // for next epoch Net::update_weights(opt); // update parameters @@ -319,8 +324,8 @@ class Net { // validation / testing set_netphases(net_phase::test); - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, "\n"); + //galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + // train_loss, " train_acc ", train_acc, "\n"); t_epoch.Stop(); @@ -331,22 +336,25 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, "\n"); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << + val_loss << " val_acc " << val_acc << " time " << val_time << "\n"; + //galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + // val_loss, " val_acc ", val_acc, "\n"); + //galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + // epoch_time + val_time, " ms (train_time ", epoch_time, + // " val_time ", val_time, ")\n"); } else { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, - ")\n"); + //galois::gPrint(header, "train_time ", std::fixed, epoch_time, + // " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + // ")\n"); } } // epoch loop double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + std::cout << "ave training time " << avg_train_time << " through " << throughput << "\n"; + //galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + // " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. 
inference or predict @@ -402,10 +410,10 @@ class Net { } #endif - galois::gPrint(header, "Doing actual forward propagation\n"); + //galois::gPrint(header, "Doing actual forward propagation\n"); loss = fprop(gBegin, gEnd, gCount, gMasks); - galois::gPrint(header, - "Forward propagation donne, going to check accuracy\n"); + //galois::gPrint(header, + // "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); // labels will be subgraph labels if applicable @@ -438,7 +446,7 @@ class Net { void construct_layers() { // append conv layers - galois::gPrint(header, "Constructing layers...\n"); + //galois::gPrint(header, "Constructing layers...\n"); for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } @@ -544,15 +552,15 @@ class Net { // set mask for the last layer; globals // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways - galois::gPrint(header, "fprop: set sample mask\n"); + //galois::gPrint(header, "fprop: set sample mask\n"); layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { - galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } - galois::gPrint(header, "fprop: getting loss\n"); + //galois::gPrint(header, "fprop: getting loss\n"); // prediction error acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 0f73f2cbca..c962f20004 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -1,3 +1,4 @@ +#ifndef __GALOIS_HET_CUDA__ #ifndef __GRAD_SYNC_STRUCT__ #define __GRAD_SYNC_STRUCT__ @@ -44,3 +45,4 @@ struct GradientSync { // TODO bitset; might have to do it manually // GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); #endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index cb5a33e783..7c3c038d15 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -1,3 +1,4 @@ +#ifndef __GALOIS_HET_CUDA__ #ifndef __GRAPH_CONV_SYNC_STRUCT__ #define __GRAPH_CONV_SYNC_STRUCT__ @@ -62,3 +63,4 @@ struct GraphConvSync { }; #endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b7779b7e5b..47ddb20dc3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -41,9 +41,11 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - //const std::string header = - // "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + - // "] "; + #ifndef __GALOIS_HET_CUDA__ + const std::string header = + "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + "] "; + #endif unsigned level_; // layer id: [0, num_layers-1] size_t begin_; // sample begin index size_t end_; // sample end index @@ -86,13 +88,15 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 
0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function - unsigned myID = 0; #ifndef __GALOIS_HET_CUDA__ - myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; #endif - std::cout << "[" << myID << "] Layer" << level_ << " type: " << layer_type() + std::cout << "Layer " << level_ << " type: " << layer_type() << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; + //galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + // "input[", input_dims[0], ",", input_dims[1], "] output[", + // output_dims[0], ",", output_dims[1], "]\n"); } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 55890d79ae..5e034ec210 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,6 +1,6 @@ #pragma once #include "deepgalois/lgraph.h" - +//#include "galois/DistGalois.h" namespace deepgalois { class Reader { diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 7093897af2..91ccc94b83 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -7,11 +7,8 @@ #include #include #include -#ifdef GALOIS_USE_DIST #include "deepgalois/GraphTypes.h" -#else -#include "deepgalois/types.h" -#endif +//#include "galois/DistGalois.h" namespace deepgalois { diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 6c408f6449..e4c110dd1e 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -32,9 +32,12 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in >> m >> num_classes >> std::ws; if (is_single_class) { std::cout << "[" << myID << "] Reader: Using single-class (one-hot) labels\n"; + //galois::gPrint("[", myID, + // "] Reader: Using single-class (one-hot) labels\n"); labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { + //galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); std::cout << "[" << myID << "] Reader: Using multi-class (one-hot) labels\n"; labels = new label_t[m * @@ -62,6 +65,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { // print the number of vertex classes std::cout << "[" << myID << "] Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; + //galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + //", time: ", t_read.Millisecs(), " ms\n"); // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -121,6 +126,7 @@ size_t Reader::read_features(float_t*& feats, std::string filetype) { //! 
set to create mask from size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + std::cout << "n:" << n << "\n"; bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -155,6 +161,9 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, std::cout << "Global read " << mask_type << "_mask range: [" << begin << ", " << end << ") Number of valid samples: " << sample_count << " (" << (float)sample_count / (float)n * (float)100 << "\%)\n"; + //galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, + // ") Number of valid samples: ", sample_count, " (", + // (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); return sample_count; } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 2f4e6ba549..db738dd2f3 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -120,6 +120,8 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, #endif std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; + //galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + // " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); return f1_micro; } diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 36a04b2f70..65a3aa9d37 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -61,20 +61,15 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } int main(int argc, char** argv) { -#ifdef GALOIS_USE_DIST galois::DistMemSys G; -#else - galois::SharedMemSys G; -#endif LonestarGnnStart(argc, argv, name, desc, url); // Get a partitioned graph first std::vector dummyVec; deepgalois::DGraph* dGraph = NULL; -#ifdef GALOIS_USE_DIST +#ifndef __GALOIS_HET_CUDA__ dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif - // initialize network + whole context on CPU // read network, initialize metadata // default setting for now; can be customized by the user From ad7e98fe65d68e4a48f7d65298036fb6b78fde59 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 22:58:26 -0500 Subject: [PATCH 311/660] fix printing --- libdeepgalois/include/deepgalois/Net.h | 60 +++++++++++++------ .../include/deepgalois/layers/layer.h | 5 +- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index d3558a99e3..b48315f00f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -299,15 +299,19 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - //galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; +#else + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); +#endif set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; //galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop - //double fw_time = evaluate("train", train_loss, train_acc); - evaluate("train", train_loss, train_acc); + double fw_time = evaluate("train", train_loss, train_acc); + //evaluate("train", train_loss, 
train_acc); //galois::gPrint(header, "Calling into backward propagation\n"); @@ -324,9 +328,13 @@ class Net { // validation / testing set_netphases(net_phase::test); - //galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - // train_loss, " train_acc ", train_acc, "\n"); - +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << " "; +#else + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, "\n"); +#endif t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); @@ -336,25 +344,39 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << - val_loss << " val_acc " << val_acc << " time " << val_time << "\n"; - //galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - // val_loss, " val_acc ", val_acc, "\n"); - //galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - // epoch_time + val_time, " ms (train_time ", epoch_time, - // " val_time ", val_time, ")\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << " "; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; +#else + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, "\n"); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); +#endif } else { - //galois::gPrint(header, "train_time ", std::fixed, epoch_time, - // " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, - // ")\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; +#else + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); +#endif } } // epoch loop double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - std::cout << "ave training time " << avg_train_time << " through " << throughput << "\n"; - //galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - // " ms. Throughput: ", throughput, " epoch/s\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << "Average training time per epoch: " << avg_train_time + << "ms. Throughput " << throughput << " epoch/s\n"; +#else + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); +#endif } // evaluate, i.e. inference or predict diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 47ddb20dc3..b21adefea1 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -88,10 +88,11 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! 
debug print function + unsigned myID = 0; #ifndef __GALOIS_HET_CUDA__ - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + myID = galois::runtime::getSystemNetworkInterface().ID; #endif - std::cout << "Layer " << level_ << " type: " << layer_type() + std::cout << "[" << myID << "] Layer " << level_ << " type: " << layer_type() << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; //galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), From b26df3b6bd385bb052ddd183c9d54e0b87d23862 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 23:18:38 -0500 Subject: [PATCH 312/660] add for single-gpu compilation --- lonestar/gnn/include/engine.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 65a3aa9d37..7de3350399 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -61,7 +61,11 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } int main(int argc, char** argv) { +#ifdef __GALOIS_HET_CUDA__ + galois::SharedMemSys G; +#else galois::DistMemSys G; +#endif LonestarGnnStart(argc, argv, name, desc, url); // Get a partitioned graph first From 12c44e42ba60a1216054d8789a7ff275e14a3ca5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 17 May 2020 09:13:35 -0500 Subject: [PATCH 313/660] gpu sampling works. Use this command to test (test acc 93.4%): ./gcn reddit -k=200 -sl=1 -ss=9000 -vi=20 -h=128 -dr=0.1 --- .../include/deepgalois/DistContext.h | 14 +++--- libdeepgalois/include/deepgalois/Net.h | 11 ++--- libdeepgalois/src/DistContext.cpp | 5 ++- libdeepgalois/src/DistContext.cu | 44 +++++++++++++++++-- libdeepgalois/src/Net.cu | 3 +- libdeepgalois/src/Sampler.cu | 16 +++---- libgpu/include/graph_gpu.h | 4 +- 7 files changed, 70 insertions(+), 27 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 49222eb3ab..332eddb3ba 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -21,6 +21,10 @@ class DistContext { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D Graph* lGraph; // learning graph version + DGraph* partitionedGraph; // the input graph, |V| = N + std::vector partitionedSubgraphs; + label_t* h_labels; // labels for classification. Single-class: Nx1, multi-class: NxE + float_t* h_feats; // input features: N x D #ifdef __GALOIS_HET_CUDA__ label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device @@ -31,11 +35,6 @@ class DistContext { #else galois::graphs::GluonSubstrate* syncSubstrate; #endif - DGraph* partitionedGraph; // the input graph, |V| = N - std::vector partitionedSubgraphs; - label_t* h_labels; // labels for classification. 
Single-class label: Nx1, - // multi-class label: NxE - float_t* h_feats; // input features: N x D std::vector h_labels_subg; // labels for subgraph std::vector h_feats_subg; // input features for subgraph std::vector normFactors; // normalization constant based on graph structure @@ -46,7 +45,10 @@ class DistContext { public: // TODO better constructor DistContext(); - DistContext(bool isDevice) : is_device(isDevice) {} + DistContext(bool isDevice) : is_device(isDevice), is_selfloop_added(false), + usingSingleClass(true), dataset(""), + num_classes(0), feat_len(0), lGraph(NULL), + partitionedGraph(NULL), h_labels(0), h_feats(0) {} ~DistContext(); size_t read_graph(std::string dataset_str, bool selfloop = false); diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index b48315f00f..7026ee623d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -201,10 +201,7 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { -// TOOD this needs to be enabled -#ifndef __GALOIS_HET_CUDA__ distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); -#endif allocateSubgraphsMasks(num_subgraphs); std::cout << header << "Constructing training vertex set induced graph...\n"; @@ -261,8 +258,8 @@ class Net { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - std::cout << "Subgraph num_vertices: " << subgraphNumVertices - << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; + //std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); } @@ -416,7 +413,11 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { +#ifdef __GALOIS_HET_CUDA__ + layers[i]->set_graph_ptr(distContext->getGraphPointer()); +#else layers[i]->set_graph_ptr(distContext->getLGraphPointer()); +#endif layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); } layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 320cc75b7f..4a9087b0b3 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,7 +3,10 @@ #include "deepgalois/configs.h" namespace deepgalois { -DistContext::DistContext() : usingSingleClass(true) {} +DistContext::DistContext() : DistContext(false) { + syncSubstrate = NULL; +} + DistContext::~DistContext() {} void DistContext::saveDistGraph(DGraph* a) { diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 7542849cef..b67f0f9125 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -64,6 +64,12 @@ cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; curandGenerator_t DistContext::curand_generator_ = 0; DistContext::DistContext() : DistContext(true) { + d_labels = NULL; + d_feats = NULL; + d_labels_subg = NULL; + d_feats_subg = NULL; + d_normFactors = NULL; + d_normFactorsSub = NULL; CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -86,10 +92,12 @@ DistContext::~DistContext() { CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); if (curand_generator_) 
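  // Cleanup note: besides the cuBLAS/cuSPARSE/cuRAND handles created in the
  // constructor, the destructor now also releases the per-subgraph device
  // buffers (d_labels_subg, d_feats_subg, d_normFactorsSub) allocated by the
  // constructSubgraph* / constructNormFactorSub routines further down in this
  // file; the null checks below keep those frees safe when sampling is unused.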
CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - if (d_labels) - CUDA_CHECK(cudaFree(d_labels)); - if (d_feats) - CUDA_CHECK(cudaFree(d_feats)); + if (d_labels) CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) CUDA_CHECK(cudaFree(d_feats)); + if (d_normFactors) CUDA_CHECK(cudaFree(d_normFactors)); + if (d_labels_subg) CUDA_CHECK(cudaFree(d_labels_subg)); + if (d_feats_subg) CUDA_CHECK(cudaFree(d_feats_subg)); + if (d_normFactorsSub) CUDA_CHECK(cudaFree(d_normFactorsSub)); } size_t DistContext::read_labels(bool isSingleClass, std::string dataset_str) { @@ -107,6 +115,15 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, s return reader.read_masks(mask_type, n, begin, end, masks); } +//! allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { size_t labels_size = m; if (!usingSingleClass) labels_size = m * num_classes; @@ -126,6 +143,7 @@ void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { } void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + //std::cout << "construct subgraph features (d_feats_subg: " << d_feats_subg << ") ... "; size_t count = 0; DistContext::h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < this->partitionedGraph->size(); i++) { @@ -137,9 +155,27 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { if (d_feats_subg) float_free_device(d_feats_subg); float_malloc_device(m * feat_len, d_feats_subg); float_copy_device(m * feat_len, &h_feats_subg[0], d_feats_subg); + //std::cout << "Done\n"; } void DistContext::constructNormFactorSub(int subgraphID) { + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + auto n = graphToUse.size(); + //std::cout << "Pre-computing subgraph normalization factor (n=" << n << ") ... 
"; + + #ifdef USE_CUSPARSE + auto nnz = graphToUse.sizeEdges(); + float_malloc_device(nnz, d_normFactorsSub); + init_const_gpu(nnz, 0.0, d_normFactors); + norm_factor_computing_edge<<>>( + n, graphToUse, d_normFactorsSub); +#else + float_malloc_device(n, d_normFactorsSub); + norm_factor_computing_node<<>>( + n, graphToUse, d_normFactorsSub); +#endif + CudaTest("solving norm_factor_computing kernel failed"); + //std::cout << "Done\n"; } void DistContext::constructNormFactor(deepgalois::Context* globalContext) { diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 8e8ce83267..2921b81996 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -148,7 +148,8 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { void Net::allocateSubgraphsMasks(int num_subgraphs) { - CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + //CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); } void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index 97835ea9cc..1cdfc49e32 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -20,7 +20,7 @@ __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, index_t* degrees) { CUDA_KERNEL_LOOP(src, n) { - if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); + //if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); degrees[src] = 0; if (masks[src] == 1) { for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { @@ -29,7 +29,7 @@ __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, degrees[src]++; } } - if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); + //if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); } } @@ -96,7 +96,7 @@ void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { template void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { - std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; + //std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); get_masked_degrees<<>>(n, masks, *g, degrees); @@ -105,7 +105,7 @@ void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s CUDA_CHECK(cudaFree(degrees)); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, &offsets[n], sizeof(index_t), cudaMemcpyDeviceToHost)); - std::cout << "maskedSG num_edges " << ne << "\n"; + //std::cout << "maskedSG num_edges " << ne << "\n"; subg->allocateFrom(n, ne); // TODO: avoid reallocation generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); CUDA_CHECK(cudaFree(offsets)); @@ -117,7 +117,7 @@ void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { index_t n = partGraph->size(); auto nv = vertex_set.size(); - std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; + //std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; // convert the vertex_set to a vertex_list and copy it to the device VertexList 
vertex_list(vertex_set.begin(), vertex_set.end()); index_t* d_vertex_list; @@ -135,7 +135,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s //CudaTest("solving set_masks kernel failed"); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed getMaskedGraph(n, d_masks, partGraph, &masked_sg); // remove edges whose destination is not masked - std::cout << "maskedGraph generated\n"; + //std::cout << "maskedGraph generated\n"; // re-index the subgraph index_t* d_new_ids; @@ -154,7 +154,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s CUDA_CHECK(cudaFree(degrees)); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); - std::cout << "subgraph num_edges " << ne << "\n"; + //std::cout << "subgraph num_edges " << ne << "\n"; // allocate memory for the subgraph sub->allocateFrom(nv, ne); // avoid reallocation @@ -162,7 +162,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub); CudaTest("solving generate_graph kernel failed"); CUDA_CHECK(cudaFree(offsets)); - std::cout << "Subgraph generated\n"; + //std::cout << "Subgraph generated\n"; } } // namespace deepgalois diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 4ddf57b950..d208a3328c 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -139,7 +139,7 @@ struct CSRGraph { delete edge_dst; edge_dst = new_edge_dst; nedges += nnodes; - printf("nnodes = %d, nedges = %d\n", nnodes, nedges); + printf("nnodes = %d, nedges = %d\n", nnodes, nedges); //print_neighbors(nnodes-1); //print_neighbors(0); } @@ -184,7 +184,7 @@ struct CSRGraph { nnodes = nv; nedges = ne; if (max_size < nnodes) max_size = nnodes; - printf("allocating memory on gpu nnodes %d nedges %d\n", max_size, nedges); + //printf("allocating memory on gpu nnodes %d nedges %d\n", max_size, nedges); if (need_realloc) { if (edge_dst) free_index_device(edge_dst); malloc_index_device(nedges, edge_dst); From a647e0ec321f51bc960c1025efd7a3c1072f2926 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 18 May 2020 16:50:57 -0500 Subject: [PATCH 314/660] GNN-OEC policy: test vertices evenly among hosts --- libcusp/include/galois/graphs/BasePolicies.h | 17 ++++- .../galois/graphs/GenericPartitioners.h | 47 ++++++++++++++ libcusp/include/galois/graphs/NewGeneric.h | 58 +++++++++++++++++ lonestar/gnn/include/DistributedGraphLoader.h | 63 +++++++------------ lonestar/gnn/src/DistributedGraphLoader.cpp | 28 ++++----- 5 files changed, 154 insertions(+), 59 deletions(-) diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index 1b1fcd8aa4..511804276e 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -65,6 +65,8 @@ class PartitioningScaffold { void saveGIDToHost(std::vector>& gid2host) { _gid2host = gid2host; } + + bool predeterminedMapping(std::vector&) { return false; } }; /** @@ -149,8 +151,13 @@ class CustomMasterAssignment : public PartitioningScaffold { char _status; //!< Specifies what phase of master assignment partitioner is on //! Metadata for determining where a node's master is std::vector _localNodeToMaster; - //! Map GID to its master + //! Map GID to its master; only for nodes we own std::unordered_map _gid2masters; + //! 
Unlike gid2masters, this contains a mapping in vector form of ALL mappings + //! for all nodes in the graph instead of just local ones; only used if it is + //! known exactly where everything ends up before partitioning + std::vector _globalHostMap; + //! This host's node offset (each host reads a distinct contiguous portion //! of graph uint64_t _nodeOffset; @@ -183,6 +190,8 @@ class CustomMasterAssignment : public PartitioningScaffold { * mapping is not found but instead returns -1 if in stage 1, else * fails. * + * ONLY WORKS IF GID IS ON LOCAL HOST ELSE WILL FAIL + * * @param gid GID to get master of * @returns Master of specified GID, -1, unsigned, if not found */ @@ -202,11 +211,13 @@ class CustomMasterAssignment : public PartitioningScaffold { } else { // NOT FOUND (not necessarily a bad thing, and required for // some cases) - galois::gDebug("[", _hostID, "] ", gid, " not found!"); + galois::gDebug("[", _hostID, "] ", gid, + " not found for retrieveMaster!"); if (_status == 2) { // die if we expect all gids to be mapped already (stage 2) GALOIS_DIE("should not fail to find a GID after stage 2 " - "of master assignment phase"); + "of master assignment phase; that or passed in gid that" + " doesn't exist on this host"); } return (uint32_t)-1; } diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index db73b84525..f1a0809f37 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -909,4 +909,51 @@ class SugarColumnFlipP : public galois::graphs::CustomMasterAssignment { } }; +class GnnOEC : public galois::graphs::CustomMasterAssignment { +public: + GnnOEC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges){}; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + //! 
outgoing edge cut + uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const { + return retrieveMaster(src); + } + + bool noCommunication() { return false; } + bool isVertexCut() const { return false; } + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(0u, 0u); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + #endif diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index efbf657670..ac17e25aed 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -73,6 +73,28 @@ class NewDistGraphGeneric : public DistGraph { uint32_t nodesToReceive; + std::vector getGNNBreakpoints(std::string filename) { + // contains 2 numbers: begin and end of test + // everything else can be split evenly among hosts as they are not + // performance critical + std::vector bps; + + // if through all possible GNN outputs + if (filename.find("cora") != std::string::npos) { + bps.push_back(0); + bps.push_back(140); + } else if (filename.find("reddit") != std::string::npos) { + bps.push_back(0); + bps.push_back(153431); + } else if (filename.find("ppi") != std::string::npos) { + bps.push_back(0); + bps.push_back(9716); + } + // TODO hardcode the rest + + return bps; + } + public: //! typedef for base DistGraph class using base_DistGraph = DistGraph; @@ -173,6 +195,7 @@ class NewDistGraphGeneric : public DistGraph { } galois::graphs::OfflineGraph g(filename); + base_DistGraph::numGlobalNodes = g.size(); base_DistGraph::numGlobalEdges = g.sizeEdges(); std::vector dummy; @@ -190,6 +213,41 @@ class NewDistGraphGeneric : public DistGraph { // TODO abstract this away somehow graphPartitioner->saveGIDToHost(base_DistGraph::gid2host); + // get training nodes and split evenly among hosts + std::vector trainPoints = this->getGNNBreakpoints(filename); + if (!trainPoints.empty()) { + std::vector testDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, g, trainPoints[0], trainPoints[1]); + + std::vector restDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, g, trainPoints[1], g.size()); + + // create global distribution of edges + std::vector mappings(g.size()); + galois::do_all( + galois::iterate((size_t)0, (size_t)base_DistGraph::numHosts), + [&](size_t h) { + // test + uint32_t hCur = testDistribution[h]; + uint32_t hEnd = testDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + // the rest + hCur = restDistribution[h]; + hEnd = restDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + }); + bool validPart = graphPartitioner->predeterminedMapping(mappings); + if (!validPart) { + galois::gWarn("partitioning policy used doesn't use trainpoints"); + } + } + uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = g.edge_begin(nodeBegin); diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index 7827c1a39f..f3755a886f 100644 --- 
a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -1,7 +1,7 @@ /* - * This file belongs to the Galois project, a C++ library for exploiting parallelism. - * The code is being released under the terms of the 3-Clause BSD License (a - * copy is located in LICENSE.txt at the top-level directory). + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS @@ -42,17 +42,18 @@ namespace graphs { //! enums of partitioning schemes supported enum PARTITIONING_SCHEME { - OEC, //!< outgoing edge cut - IEC, //!< incoming edge cut - HOVC, //!< outgoing hybrid vertex cut - HIVC, //!< incoming hybrid vertex cut - CART_VCUT, //!< cartesian vertex cut - CART_VCUT_IEC, //!< cartesian vertex cut using iec - GINGER_O, //!< Ginger, outgoing - GINGER_I, //!< Ginger, incoming - FENNEL_O, //!< Fennel, oec - FENNEL_I, //!< Fennel, iec - SUGAR_O //!< Sugar, oec + OEC, //!< outgoing edge cut + IEC, //!< incoming edge cut + HOVC, //!< outgoing hybrid vertex cut + HIVC, //!< incoming hybrid vertex cut + CART_VCUT, //!< cartesian vertex cut + CART_VCUT_IEC, //!< cartesian vertex cut using iec + GINGER_O, //!< Ginger, outgoing + GINGER_I, //!< Ginger, incoming + FENNEL_O, //!< Fennel, oec + FENNEL_I, //!< Fennel, iec + SUGAR_O, //!< Sugar, oec + GNN_OEC //!< gnn, oec }; /** @@ -85,6 +86,8 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { return "fennel-iec"; case SUGAR_O: return "sugar-oec"; + case GNN_OEC: + return "gnn-oec"; default: GALOIS_DIE("Unsupported partition"); } @@ -121,8 +124,7 @@ namespace graphs { * loaded based on command line arguments */ template -DistGraph* -constructSymmetricGraph(std::vector&) { +DistGraph* constructSymmetricGraph(std::vector&) { std::string inputFile = deepgalois::path + dataset + ".csgr"; galois::gInfo("File to read is ", inputFile); @@ -130,36 +132,19 @@ constructSymmetricGraph(std::vector&) { case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); - - case GINGER_O: - case GINGER_I: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); - - case FENNEL_O: - case FENNEL_I: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); - - case SUGAR_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + case GNN_OEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index 7c309dedc2..71953ea53e 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -1,7 
+1,7 @@ /* - * This file belongs to the Galois project, a C++ library for exploiting parallelism. - * The code is being released under the terms of the 3-Clause BSD License (a - * copy is located in LICENSE.txt at the top-level directory). + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS @@ -32,17 +32,11 @@ namespace cll = llvm::cl; cll::opt partitionScheme( "partition", cll::desc("Type of partitioning."), - cll::values( - clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), - clEnumValN(IEC, "iec", "Incoming Edge-Cut"), - clEnumValN(HOVC, "hovc", "Outgoing Hybrid Vertex-Cut"), - clEnumValN(HIVC, "hivc", "Incoming Hybrid Vertex-Cut"), - clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), - clEnumValN(CART_VCUT_IEC, "cvc-iec", "Cartesian Vertex-Cut of iec"), - //clEnumValN(CEC, "cec", "Custom edge cut from vertexID mapping"), - clEnumValN(GINGER_O, "ginger-o", "ginger, outgiong edges, using CuSP"), - clEnumValN(GINGER_I, "ginger-i", "ginger, incoming edges, using CuSP"), - clEnumValN(FENNEL_O, "fennel-o", "fennel, outgoing edge cut, using CuSP"), - clEnumValN(FENNEL_I, "fennel-i", "fennel, incoming edge cut, using CuSP"), - clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP")), - cll::init(OEC)); + cll::values(clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), + clEnumValN(IEC, "iec", "Incoming Edge-Cut"), + clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), + clEnumValN(CART_VCUT_IEC, "cvc-iec", + "Cartesian Vertex-Cut of iec"), + clEnumValN(GNN_OEC, "g-oec", + "gnn oec: train nodes evenly distributed")), + cll::init(GNN_OEC)); From 18d102c8923983ed53d0bf91586177af4146cc43 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 19 May 2020 13:16:45 -0500 Subject: [PATCH 315/660] gnn eneable dist galoi s-> galois use dist --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 2 -- libdeepgalois/include/deepgalois/types.h | 2 -- lonestar/gnn/CMakeLists.txt | 2 +- lonestar/gnn/gat/CMakeLists.txt | 2 +- lonestar/gnn/gcn/CMakeLists.txt | 2 +- lonestar/gnn/gin/CMakeLists.txt | 2 +- lonestar/gnn/gin/gin.cpp | 2 -- lonestar/gnn/include/engine.h | 7 ------- lonestar/gnn/sage/CMakeLists.txt | 2 +- 10 files changed, 6 insertions(+), 19 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index a022a36655..32c5fa0212 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -106,7 +106,7 @@ target_include_directories(dg_cpu PUBLIC ) # dist galois setup/linking to dg_cpu -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(dg_cpu galois_dist_async galois_cusp galois_gluon) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libdist/include diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 216b7e1935..a02beebd57 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,9 +1,7 @@ #pragma once #include "layer.h" #include "deepgalois/layers/aggregator.h" -#ifdef GALOIS_USE_DIST #include "deepgalois/layers/GraphConvSyncStructures.h" -#endif /** * 
GraphConv Layer; based on DGL implementation + follows TinyDNN layer diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index e3165abc8a..43d55eb331 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -43,7 +43,6 @@ enum class net_phase { train, test }; #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE -#ifdef GALOIS_USE_DIST namespace deepgalois { // TODO only being used by graph conv layer at the moment so extern works, // but this design is bad and needs to be revisited @@ -54,6 +53,5 @@ extern float_t* _dataToSync; //! sync extern long unsigned _syncVectorSize; } // namespace deepgalois -#endif #endif diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 40eac53052..0020736fee 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -17,7 +17,7 @@ if(ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") endif() -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) target_include_directories(distgraphloader PUBLIC include) target_link_libraries(distgraphloader galois_cusp LLVMSupport) diff --git a/lonestar/gnn/gat/CMakeLists.txt b/lonestar/gnn/gat/CMakeLists.txt index f9f1efdc6f..8d172ac154 100644 --- a/lonestar/gnn/gat/CMakeLists.txt +++ b/lonestar/gnn/gat/CMakeLists.txt @@ -8,7 +8,7 @@ if(ENABLE_HETERO_GALOIS) target_link_libraries(gat PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(gat PRIVATE dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(gat PRIVATE distgraphloader) endif() endif() diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index fc5f134d76..9ed4ef97d9 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -11,7 +11,7 @@ if(ENABLE_HETERO_GALOIS) target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(gcn PRIVATE dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(gcn PRIVATE distgraphloader) endif() endif() diff --git a/lonestar/gnn/gin/CMakeLists.txt b/lonestar/gnn/gin/CMakeLists.txt index f32f47179e..5d63e3d0d7 100644 --- a/lonestar/gnn/gin/CMakeLists.txt +++ b/lonestar/gnn/gin/CMakeLists.txt @@ -1,6 +1,6 @@ app(gin gin.cpp) target_link_libraries(gin dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(gin distgraphloader) endif() if(ENABLE_HETERO_GALOIS) diff --git a/lonestar/gnn/gin/gin.cpp b/lonestar/gnn/gin/gin.cpp index aecfcf9b35..4eb8835214 100644 --- a/lonestar/gnn/gin/gin.cpp +++ b/lonestar/gnn/gin/gin.cpp @@ -1,9 +1,7 @@ // Graph Neural Networks // Xuhao Chen #include "lonestargnn.h" -#ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" -#endif const char* name = "Graph Isomorphism Network (GIN)"; const char* desc = "Graph isomorphism neural networks on an undirected graph"; diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 7de3350399..cf39ce95f3 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,12 +1,9 @@ // Execution engine #include #include -#ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" #include "galois/DistGalois.h" #include "galois/runtime/Network.h" -#endif -#include "galois/Galois.h" #include "galois/Version.h" #include "galois/Timer.h" #include "deepgalois/Net.h" @@ -28,10 +25,8 @@ void LonestarGnnStart(int 
argc, char** argv, const char* app, const char* desc, numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU #endif -#ifdef GALOIS_USE_DIST auto& net = galois::runtime::getSystemNetworkInterface(); if (net.ID == 0) { -#endif LonestarGnnPrintVersion(llvm::outs()); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; @@ -51,9 +46,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); galois::runtime::reportParam("(NULL)", "Threads", numThreads); -#ifdef GALOIS_USE_DIST } -#endif char name[256]; gethostname(name, 256); diff --git a/lonestar/gnn/sage/CMakeLists.txt b/lonestar/gnn/sage/CMakeLists.txt index 94b6d234b7..ee95292588 100644 --- a/lonestar/gnn/sage/CMakeLists.txt +++ b/lonestar/gnn/sage/CMakeLists.txt @@ -8,7 +8,7 @@ if(ENABLE_HETERO_GALOIS) target_link_libraries(sage PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(sage PRIVATE dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(sage PRIVATE distgraphloader) endif() endif() From 6659aa0356393bf5657bc268509afb9997ab757f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 19 May 2020 13:33:53 -0500 Subject: [PATCH 316/660] heterogalois -> eneable gpu; TODO fix moderngpu --- CMakeLists.txt | 18 ++++++++--------- libdeepgalois/CMakeLists.txt | 12 +++++------ .../include/deepgalois/DistContext.h | 6 +++--- libdeepgalois/include/deepgalois/GraphTypes.h | 4 ++-- libdeepgalois/include/deepgalois/Net.h | 20 +++++++++---------- .../deepgalois/layers/GradientSyncStructs.h | 2 +- .../layers/GraphConvSyncStructures.h | 2 +- .../include/deepgalois/layers/aggregator.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 14 ++++++------- libdeepgalois/include/deepgalois/lgraph.h | 2 +- libdeepgalois/include/deepgalois/optimizer.h | 18 ++++++++--------- libdeepgalois/src/reader.cpp | 2 +- libdeepgalois/src/utils.cpp | 2 +- lonestar/gnn/CMakeLists.txt | 6 +++--- lonestar/gnn/gat/CMakeLists.txt | 2 +- lonestar/gnn/gcn/CMakeLists.txt | 4 ++-- lonestar/gnn/gin/CMakeLists.txt | 2 +- lonestar/gnn/include/engine.h | 6 +++--- lonestar/gnn/sage/CMakeLists.txt | 2 +- 20 files changed, 63 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f08d91e6fb..02e2aca6b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -254,7 +254,14 @@ if(USE_DEEPGALOIS) endif(USE_DEEPGALOIS) if (GALOIS_ENABLE_GPU) - # TODO may require cleanup + enable_language(CUDA) + string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") + endforeach() + add_subdirectory(libgpu) + if (USE_DEEPGALOIS) SET(CUDA_SEPARABLE_COMPILATION ON) find_package(CUDA REQUIRED) @@ -279,15 +286,6 @@ if (GALOIS_ENABLE_GPU) #find_package(OpenCL REQUIRED) endif() - - enable_language(CUDA) - string(REPLACE "." 
"" GENCODES ${CUDA_CAPABILITY}) - string(REPLACE "," ";" GENCODES ${GENCODES}) - foreach(GENCODE ${GENCODES}) - add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") - endforeach() - - add_subdirectory(libgpu) endif() add_subdirectory(libpangolin) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 32c5fa0212..1abc692a9f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -25,9 +25,9 @@ include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) # hetero path - set(CUDA_NVCC_FLAGS "-D__GALOIS_HET_CUDA__ ${CUDA_NVCC_FLAGS}") + set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU ${CUDA_NVCC_FLAGS}") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers @@ -71,8 +71,8 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if(ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") +if(GALOIS_ENABLE_GPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp) else() set(sources @@ -94,7 +94,7 @@ else() src/node.cpp src/Net.cpp ) -endif(ENABLE_HETERO_GALOIS) +endif(GALOIS_ENABLE_GPU) add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) @@ -114,7 +114,7 @@ if(GALOIS_ENABLE_DIST) ${CMAKE_SOURCE_DIR}/libgluon/include ) - if(ENABLE_HETERO_GALOIS) + if(GALOIS_ENABLE_GPU) target_link_libraries(dg_gpu galois_dist_async galois_cusp galois_gluon) target_include_directories(dg_gpu PUBLIC ${CMAKE_SOURCE_DIR}/libdist/include diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 332eddb3ba..c614a92ca2 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -1,6 +1,6 @@ #ifndef __DG_DIST_CONTEXT__ #define __DG_DIST_CONTEXT__ -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU #include "deepgalois/cutils.h" #else #include "galois/graphs/GluonSubstrate.h" @@ -25,7 +25,7 @@ class DistContext { std::vector partitionedSubgraphs; label_t* h_labels; // labels for classification. 
Single-class: Nx1, multi-class: NxE float_t* h_feats; // input features: N x D -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device @@ -68,7 +68,7 @@ class DistContext { Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; void initializeSyncSubstrate(); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 2ee3f86b93..1528375290 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -3,7 +3,7 @@ #include "deepgalois/types.h" #include "deepgalois/lgraph.h" -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU #define USE_CSRGRAPH #ifdef USE_CSRGRAPH #include "graph_gpu.h" @@ -16,7 +16,7 @@ namespace deepgalois { using edge_iterator = index_t; using GraphCPU = LearningGraph; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU using DGraph = CSRGraph; using Graph = CSRGraph; using GraphGPU = CSRGraph; diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 7026ee623d..e17a9f9b76 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -22,7 +22,7 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU unsigned myID = 0; #else unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -101,7 +101,7 @@ class Net { h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU this->myID = galois::runtime::getSystemNetworkInterface().ID; #endif this->header = "[" + std::to_string(myID) + "] "; @@ -296,7 +296,7 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); @@ -325,7 +325,7 @@ class Net { // validation / testing set_netphases(net_phase::test); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss << " train_acc " << train_acc << " "; #else @@ -341,7 +341,7 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << val_loss << " val_acc " << val_acc << " "; std::cout << header << "time " << std::setprecision(3) << std::fixed @@ -355,7 +355,7 @@ class Net { " val_time ", val_time, ")\n"); #endif } else { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "train_time " << std::fixed << epoch_time << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; #else @@ -367,7 +367,7 @@ class Net { double avg_train_time = total_train_time / 
(double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << "Average training time per epoch: " << avg_train_time << "ms. Throughput " << throughput << " epoch/s\n"; #else @@ -413,7 +413,7 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU layers[i]->set_graph_ptr(distContext->getGraphPointer()); #else layers[i]->set_graph_ptr(distContext->getLGraphPointer()); @@ -423,7 +423,7 @@ class Net { layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data } -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU if (type == "train") { gMasks = d_train_masks; } else if (type == "val") { @@ -552,7 +552,7 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); #else layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index c962f20004..9b325311b7 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -1,4 +1,4 @@ -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #ifndef __GRAD_SYNC_STRUCT__ #define __GRAD_SYNC_STRUCT__ diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index 7c3c038d15..95e09b1c0d 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -1,4 +1,4 @@ -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #ifndef __GRAPH_CONV_SYNC_STRUCT__ #define __GRAPH_CONV_SYNC_STRUCT__ diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 142812c6ba..3f2d3c7f1b 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -2,7 +2,7 @@ #include "deepgalois/types.h" //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #include "deepgalois/GraphTypes.h" namespace deepgalois { // TODO template arg diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index a02beebd57..ad954215fc 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -51,7 +51,7 @@ class graph_conv_layer : public layer { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b21adefea1..02b5abebb4 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -15,7 +15,7 @@ #include "deepgalois/layers/node.h" #include "deepgalois/DistContext.h" -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" @@ -41,7 +41,7 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - #ifndef __GALOIS_HET_CUDA__ + #ifndef GALOIS_ENABLE_GPU const std::string header = "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + "] "; @@ -70,7 +70,7 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; // TODO -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU GraphGPU* graph_gpu; #else Graph* graph_cpu; @@ -89,7 +89,7 @@ class layer : public deepgalois::node { virtual void malloc_and_init() {} void print_layer_info() { //! debug print function unsigned myID = 0; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif std::cout << "[" << myID << "] Layer " << level_ << " type: " << layer_type() @@ -120,7 +120,7 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } #else void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } @@ -145,7 +145,7 @@ class layer : public deepgalois::node { use_mask = false; if (masks != NULL) { use_mask = true; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU masks_ = masks; #else d_masks_ = masks; @@ -183,7 +183,7 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
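    // Note on the optimizer interface used here (declared in
    // deepgalois/optimizer.h; the concrete update rules live in
    // optimizer.cpp/.cu and are not shown in this hunk): on the CPU path the
    // layer hands its accumulated gradient and weight vectors to
    // opt->update(weight_grad, W), while the GALOIS_ENABLE_GPU path calls
    // opt->update_gpu(n, dW, W) on raw device pointers. As a rough sketch,
    // assuming plain gradient_descent with learning rate alpha and weight
    // decay lambda, the update amounts to
    //   for (size_t i = 0; i < W.size(); i++)
    //     W[i] -= alpha * (dW[i] + lambda * W[i]);
    // stateful optimizers (momentum, adagrad, adam, ...) additionally keep
    // per-weight history through stateful_optimizer<N>::get<>().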
// bool parallel = (W.size() >= 512); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index e0527b2161..2e086ebf88 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -127,7 +127,7 @@ class LearningGraph { index_t getEdgeDstHost(index_t eid) { return colidx_[eid]; } index_t edge_begin_host(index_t vid) { return rowptr_[vid]; } index_t edge_end_host(index_t vid) { return rowptr_[vid + 1]; } -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index 3a0139418e..ceb0f93ba0 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -30,7 +30,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; #endif virtual void reset() {} // override to implement pre-learning action @@ -53,7 +53,7 @@ struct stateful_optimizer : public optimizer { return E_[Index][&key]; } std::unordered_map E_[N]; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU template float_t* get_gpu(const size_t n, const float_t* key); std::unordered_map dE_[N]; @@ -70,7 +70,7 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -87,7 +87,7 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -103,7 +103,7 @@ struct adam : public stateful_optimizer<2> { : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -128,7 +128,7 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -146,7 +146,7 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -164,7 +164,7 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) 
{} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -185,7 +185,7 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index e4c110dd1e..016a72d26a 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -17,7 +17,7 @@ namespace deepgalois { // required. size_t Reader::read_labels(bool is_single_class, label_t*& labels) { unsigned myID = 0; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reader: Reading labels...\n"); #endif diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index db738dd2f3..10cd18832c 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -115,7 +115,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, : 0.; unsigned myID = 0; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 0020736fee..f718db4942 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(${CMAKE_SOURCE_DIR}/lonestar/gnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) link_directories(${CUDA_HOME}/lib64) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) link_directories(${INTEL_LIBS_DIR}) endif() @@ -13,8 +13,8 @@ if(USE_MKL_BLAS) endif() link_directories(${BLAS_LIB_DIR}) -if(ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") +if(GALOIS_ENABLE_GPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") endif() if(GALOIS_ENABLE_DIST) diff --git a/lonestar/gnn/gat/CMakeLists.txt b/lonestar/gnn/gat/CMakeLists.txt index 8d172ac154..5fc85aa8a6 100644 --- a/lonestar/gnn/gat/CMakeLists.txt +++ b/lonestar/gnn/gat/CMakeLists.txt @@ -1,7 +1,7 @@ add_executable(gat gat.cpp) target_link_libraries(gat PRIVATE Galois::shmem lonestar) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) set_property(TARGET gat PROPERTY CUDA_STANDARD 14) set_property(TARGET gat PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(gat PRIVATE dg_gpu dg_cpu) diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index 9ed4ef97d9..7ad1ba6e29 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,10 +1,10 @@ -#if(ENABLE_HETERO_GALOIS) +#if(GALOIS_ENABLE_GPU) # set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) #endif() add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) set_property(TARGET gcn PROPERTY CUDA_STANDARD 14) set_property(TARGET gcn PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(gcn PRIVATE dg_gpu dg_cpu) diff --git a/lonestar/gnn/gin/CMakeLists.txt b/lonestar/gnn/gin/CMakeLists.txt index 5d63e3d0d7..7e6027174a 
100644 --- a/lonestar/gnn/gin/CMakeLists.txt +++ b/lonestar/gnn/gin/CMakeLists.txt @@ -3,7 +3,7 @@ target_link_libraries(gin dg_cpu) if(GALOIS_ENABLE_DIST) target_link_libraries(gin distgraphloader) endif() -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) target_link_libraries(gin dg_gpu) target_link_libraries(gin -lcudart -lcublas -lcurand -lcudadevrt) endif() diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index cf39ce95f3..84aa5cbadd 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -21,7 +21,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, llvm::cl::ParseCommandLineOptions(argc, argv); galois::runtime::setStatFile(statFile); -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU #endif @@ -54,7 +54,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } int main(int argc, char** argv) { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU galois::SharedMemSys G; #else galois::DistMemSys G; @@ -64,7 +64,7 @@ int main(int argc, char** argv) { // Get a partitioned graph first std::vector dummyVec; deepgalois::DGraph* dGraph = NULL; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif // initialize network + whole context on CPU diff --git a/lonestar/gnn/sage/CMakeLists.txt b/lonestar/gnn/sage/CMakeLists.txt index ee95292588..b820f7024b 100644 --- a/lonestar/gnn/sage/CMakeLists.txt +++ b/lonestar/gnn/sage/CMakeLists.txt @@ -1,7 +1,7 @@ add_executable(sage sage.cpp) target_link_libraries(sage PRIVATE Galois::shmem lonestar) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) set_property(TARGET sage PROPERTY CUDA_STANDARD 14) set_property(TARGET sage PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(sage PRIVATE dg_gpu dg_cpu) From 893e44128e1a28b3de23b0fd9f53ff5756dfc5fd Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 19 May 2020 16:53:44 -0500 Subject: [PATCH 317/660] fix gpu compile errors --- libdeepgalois/CMakeLists.txt | 2 +- lonestar/gnn/include/engine.h | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 1abc692a9f..4f7d9f93ac 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -27,7 +27,7 @@ link_directories(${CMAKE_SOURCE_DIR}/libgalois) if(GALOIS_ENABLE_GPU) # hetero path - set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU ${CUDA_NVCC_FLAGS}") + set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU --extended-lambda ${CUDA_NVCC_FLAGS}") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 84aa5cbadd..25644c720d 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,9 +1,13 @@ // Execution engine #include #include +#ifdef GALOIS_ENABLE_GPU +#include "galois/Galois.h" +#else #include "DistributedGraphLoader.h" #include "galois/DistGalois.h" #include "galois/runtime/Network.h" +#endif #include "galois/Version.h" #include "galois/Timer.h" #include "deepgalois/Net.h" @@ -21,12 +25,13 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, llvm::cl::ParseCommandLineOptions(argc, argv); galois::runtime::setStatFile(statFile); + unsigned hostID = 0; #ifndef 
GALOIS_ENABLE_GPU numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU + hostID = galois::runtime::getSystemNetworkInterface().ID; #endif - auto& net = galois::runtime::getSystemNetworkInterface(); - if (net.ID == 0) { + if (hostID == 0) { LonestarGnnPrintVersion(llvm::outs()); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; From c5a5ee56892f85943279f51e25e5676ca02f0156 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 1 Jun 2020 11:41:08 -0500 Subject: [PATCH 318/660] add src/Train.cpp --- libdeepgalois/CMakeLists.txt | 3 +- libdeepgalois/include/deepgalois/Net.h | 527 ++----------------------- libdeepgalois/src/Train.cpp | 509 ++++++++++++++++++++++++ 3 files changed, 540 insertions(+), 499 deletions(-) create mode 100644 libdeepgalois/src/Train.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 4f7d9f93ac..44be89edad 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -73,7 +73,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(GALOIS_ENABLE_GPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") - set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp) + set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp src/Train.cpp) else() set(sources src/layers/softmax_loss_layer.cpp @@ -91,6 +91,7 @@ else() src/reader.cpp src/lgraph.cpp src/utils.cpp + src/Train.cpp src/node.cpp src/Net.cpp ) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e17a9f9b76..6c720f730d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -91,374 +91,26 @@ class Net { Sampler* sampler; public: - Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, bool selfloop, - bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, - int val_itv) - : is_single_class(single), has_l2norm(l2norm), has_dense(dense), - neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), - num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), - val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { - // init some identifiers for this host -#ifndef GALOIS_ENABLE_GPU - this->myID = galois::runtime::getSystemNetworkInterface().ID; -#endif - this->header = "[" + std::to_string(myID) + "] "; - this->seperator = " "; - - assert(n_conv > 0); - - //galois::gPrint(header, "Configuration: num_threads ", num_threads, - // ", num_conv_layers ", num_conv_layers, ", num_epochs ", - // num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - // learning_rate, ", dropout_rate ", dropout_rate, - // ", weight_decay ", weight_decay, "\n"); - this->num_layers = num_conv_layers + 1; - - // additional layers to add - if (has_l2norm) - this->num_layers++; - if (has_dense) - this->num_layers++; - // initialize feature metadata - feature_dims.resize(num_layers + 1); - - // initialze global graph context - graphTopologyContext = new deepgalois::Context(); - graphTopologyContext->set_dataset(dataset_str); - // read *entire* graph, get num nodes - globalSamples = graphTopologyContext->read_graph(selfloop); - - // get training and validation sets: this is to create the training - // subgraph in the sampler - globalTrainMasks = new mask_t[globalSamples]; - globalValMasks = new mask_t[globalSamples]; - globalTestMasks = new 
mask_t[globalSamples]; - std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); - std::fill(globalValMasks, globalValMasks + globalSamples, 0); - - // reddit is hard coded - if (dataset_str == "reddit") { - this->globalTrainBegin = 0; - this->globalTrainCount = 153431; - this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - this->globalValBegin = 153431; - this->globalValCount = 23831; - this->globalValEnd = this->globalValBegin + this->globalValCount; - - // TODO do all can be used below - for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) - globalTrainMasks[i] = 1; - for (size_t i = globalValBegin; i < globalValEnd; i++) - globalValMasks[i] = 1; - } else { - globalTrainCount = graphTopologyContext->read_masks( - "train", globalSamples, globalTrainBegin, globalTrainEnd, - globalTrainMasks); - globalValCount = graphTopologyContext->read_masks( - "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); - } - - // make sure sampel size isn't greater than what we have to train with - assert(subgraph_sample_size <= globalTrainCount); - - layers.resize(num_layers); - // hidden1 level embedding: 16 - for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = this->h1; - - // features are read in distcontext, not this context (this context only - // used for sampling) - if (subgraph_sample_size) - sampler = new deepgalois::Sampler(); - } - //! Default net constructor - // Net() - // : is_single_class(true), has_l2norm(false), has_dense(false), - // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), - // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), - // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), - // weight_decay(0.0), globalTrainBegin(0), globalTrainEnd(0), - // globalTrainCount(0), globalValBegin(0), globalValEnd(0), - // globalValCount(0), globalTestBegin(0), globalTestEnd(0), - // globalTestCount(0), val_interval(1), num_subgraphs(1), - // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), - // globalTestMasks(NULL), context(NULL) {} + Net() : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, + false, true, false, false, 25, 9000, 1) {} + //! Net constructor + Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, int val_itv); + + // allocate memory for subgraph masks void allocateSubgraphsMasks(int num_subgraphs); //! 
Initializes metadata for the partition: loads data, labels, etc - void partitionInit(DGraph* graph, std::string dataset_str, - bool isSingleClassLabel); + void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } void regularize(); // add weight decay - - void train(optimizer* opt, bool need_validate) { - double total_train_time = 0.0; - int num_subg_remain = 0; - - if (subgraph_sample_size) { - distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); - allocateSubgraphsMasks(num_subgraphs); - std::cout << header - << "Constructing training vertex set induced graph...\n"; - // auto gg = distContext->getGraphPointer(); - auto gg = - graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, - distContext->getGraphPointer()); - } - - //galois::gPrint(header, "Start training...\n"); - - Timer t_epoch; - - // run epochs - for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { - t_epoch.Start(); - - //////////////////////////////////////////////////////////////////////////////// - // Sampling - //////////////////////////////////////////////////////////////////////////////// - if (subgraph_sample_size) { - if (num_subg_remain == 0) { - std::cout << header << "Generating " << num_subgraphs - << " subgraph(s)\n"; - // TODO stat timer instead of this timer - Timer t_subgen; - t_subgen.Start(); - - // generate subgraphs - for (int sid = 0; sid < num_subgraphs; sid++) { - VertexSet sampledSet; - sampler->selectVertices(subgraph_sample_size, sampledSet, - curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, - subgraphs_masks + sid * globalSamples, - distContext->getSubgraphPointer(sid)); - } - num_subg_remain = num_subgraphs; - t_subgen.Stop(); - // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; - } - // count their degrees - for (int i = 0; i < num_subgraphs; i++) { - auto sg_ptr = distContext->getSubgraphPointer(i); - sg_ptr->degree_counting(); - // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " - // num_e ", sg_ptr->sizeEdges(), "\n"); - } - - // choose a subgraph to use - num_subg_remain--; - int sg_id = num_subg_remain; - auto subgraphPointer = distContext->getSubgraphPointer(sg_id); - this->subgraphNumVertices = subgraphPointer->size(); - - //std::cout << "Subgraph num_vertices: " << subgraphNumVertices - // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; - for (size_t i = 0; i < num_layers; i++) { - layers[i]->update_dim_size(this->subgraphNumVertices); - } - - // TODO dist version where i need global degrees - // change normalization constants - distContext->constructNormFactorSub(sg_id); - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr( - distContext->get_norm_factors_subg_ptr()); - } - - // update labels for subgraph - distContext->constructSubgraphLabels( - this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); - layers[num_layers - 1]->set_labels_ptr( - distContext->get_labels_subg_ptr()); - - // update features for subgraph - distContext->constructSubgraphFeatures( - this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); - layers[0]->set_feats_ptr( - distContext->get_feats_subg_ptr()); // feed input data - - // Graph* testing = 
distContext->getSubgraphPointer(sg_id); - // for (size_t i = 0; i < testing->size(); i++) { - // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) - // { - // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); - // } - //} - } // end subgraph sample loop - //////////////////////////////////////////////////////////////////////////////// - - // training steps -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; -#else - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); -#endif - set_netphases(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - - //galois::gPrint(header, "Calling into eval for forward propagation\n"); - // forward: after this phase, layer edges will contain intermediate - // features for use during backprop - double fw_time = evaluate("train", train_loss, train_acc); - //evaluate("train", train_loss, train_acc); - - - //galois::gPrint(header, "Calling into backward propagation\n"); - // backward: use intermediate features + ground truth to update layers - // with feature gradients whcih are then used to calculate weight - // gradients - Net::bprop(); - - //galois::gPrint(header, "Weight update call\n"); - // gradient update: use gradients stored on each layer to update model - // for next epoch - Net::update_weights(opt); // update parameters - - // validation / testing - set_netphases(net_phase::test); - -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << " "; -#else - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, "\n"); -#endif - t_epoch.Stop(); - - double epoch_time = t_epoch.Millisecs(); - total_train_time += epoch_time; - - if (need_validate && curEpoch % val_interval == 0) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - double val_time = evaluate("val", val_loss, val_acc); -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed - << val_loss << " val_acc " << val_acc << " "; - std::cout << header << "time " << std::setprecision(3) << std::fixed - << epoch_time + val_time << " ms (train_time " << epoch_time - << " val_time " << val_time << ")\n"; -#else - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, "\n"); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); -#endif - } else { -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; -#else - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); -#endif - } - } // epoch loop - - double avg_train_time = total_train_time / (double)num_epochs; - double throughput = 1000.0 * (double)num_epochs / total_train_time; -#ifdef GALOIS_ENABLE_GPU - std::cout << "Average training time per epoch: " << avg_train_time - << "ms. Throughput " << throughput << " epoch/s\n"; -#else - galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); -#endif - } - - // evaluate, i.e. 
inference or predict - double evaluate(std::string type, acc_t& loss, acc_t& acc) { - Timer t_eval; - t_eval.Start(); - size_t gBegin = 0, gEnd = 0, gCount = 0; - mask_t* gMasks = NULL; - - // TODO global here good for dist case? - if (type == "train") { - gBegin = globalTrainBegin; - gEnd = globalTrainEnd; - gCount = globalTrainCount; - gMasks = globalTrainMasks; - if (subgraph_sample_size) { - // update gMasks for subgraph - gMasks = NULL; - gBegin = 0; - gEnd = this->subgraphNumVertices; - gCount = this->subgraphNumVertices; - } - } else if (type == "val") { - gBegin = globalValBegin; - gEnd = globalValEnd; - gCount = globalValCount; - gMasks = globalValMasks; - } else { - gBegin = globalTestBegin; - gEnd = globalTestEnd; - gCount = globalTestCount; - gMasks = globalTestMasks; - } - - // switch to the original graph if not training - if (subgraph_sample_size && type != "train") { - for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(distNumSamples); - for (size_t i = 0; i < num_conv_layers; i++) { -#ifdef GALOIS_ENABLE_GPU - layers[i]->set_graph_ptr(distContext->getGraphPointer()); -#else - layers[i]->set_graph_ptr(distContext->getLGraphPointer()); -#endif - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); - } - layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); - layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data - } -#ifdef GALOIS_ENABLE_GPU - if (type == "train") { - gMasks = d_train_masks; - } else if (type == "val") { - gMasks = d_val_masks; - } else { - gMasks = d_test_masks; - } -#endif - - //galois::gPrint(header, "Doing actual forward propagation\n"); - loss = fprop(gBegin, gEnd, gCount, gMasks); - //galois::gPrint(header, - // "Forward propagation donne, going to check accuracy\n"); - float_t* predictions = layers[num_layers - 1]->next()->get_data(); - - // labels will be subgraph labels if applicable - label_t* localLabels; - if (type == "train" && subgraph_sample_size) { - localLabels = distContext->get_labels_subg_ptr(); - } else { - // note this grabs local labels - localLabels = distContext->get_labels_ptr(); - } - - if (is_single_class) { - acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, - localLabels); - } else { - acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, - predictions, localLabels); - } - - t_eval.Stop(); - return t_eval.Millisecs(); - } + void train(optimizer* opt, bool need_validate); + double evaluate(std::string type, acc_t& loss, acc_t& acc); //! 
read masks of test set for GLOBAL set void read_test_masks(std::string dataset); @@ -466,153 +118,32 @@ class Net { void readDistributedTestMasks(std::string dataset); // void copy_test_masks_to_device(); - - void construct_layers() { - // append conv layers - //galois::gPrint(header, "Constructing layers...\n"); - for (size_t i = 0; i < num_conv_layers - 1; i++) { - append_conv_layer(i, true); // conv layers, act=true - } - append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false - - if (has_l2norm) { - append_l2norm_layer(num_conv_layers); // l2_norm layer - } - if (has_dense) { - append_dense_layer(num_layers - 2); // dense layer - } - append_out_layer(num_layers - 1); // output layer - - // allocate memory for intermediate features and gradients - for (size_t i = 0; i < num_layers; i++) { - layers[i]->add_edge(); - } - for (size_t i = 1; i < num_layers; i++) { - connect(layers[i - 1], layers[i]); - } - for (size_t i = 0; i < num_layers; i++) { - layers[i]->malloc_and_init(); - } - - layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data - // precompute the normalization constant based on graph structure - // context->norm_factor_computing(false); - distContext->constructNormFactor(graphTopologyContext); - for (size_t i = 0; i < num_conv_layers; i++) - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); - set_contexts(); - } + void construct_layers(); //! Add an l2_norm layer to the network - void append_l2norm_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = distNumSamples; - in_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); - } + void append_l2norm_layer(size_t layer_id); //! Add an dense layer to the network - void append_dense_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = distNumSamples; - in_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); - } + void append_dense_layer(size_t layer_id); //! Add an output layer to the network - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - - if (is_single_class) - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - else - layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - - layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); - } + void append_out_layer(size_t layer_id); //! 
Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true) { - assert(dropout_rate < 1.0); - assert(layer_id < num_conv_layers); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); -#ifdef GALOIS_ENABLE_GPU - layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); -#else - layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); -#endif - } - - // update trainable weights after back-propagation - void update_weights(optimizer* opt) { - regularize(); - for (size_t i = 0; i < num_layers; i++) { - if (layers[i]->trainable()) { - layers[i]->update_weight(opt); - } - } - } - - //! forward propagation: [begin, end) is the range of samples used. - //! calls "forward" on each layer and returns the loss of the final layer - acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { - // set mask for the last layer; globals - // TODO this should be distirbuted sample gBegin->end not global; fix later - // seems to be unused in code right now anyways - //galois::gPrint(header, "fprop: set sample mask\n"); - layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); - - for (size_t i = 0; i < num_layers; i++) { - //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); - layers[i]->forward(); - } - - //galois::gPrint(header, "fprop: getting loss\n"); - // prediction error - acc_t loss = layers[num_layers - 1]->get_prediction_loss(); - // Squared Norm Regularization to mitigate overfitting - loss += weight_decay * layers[0]->get_weight_decay_loss(); - return loss; - } - - void bprop() { - for (size_t i = num_layers; i != 0; i--) { - layers[i - 1]->backward(); - } - } - - //! Save the context object to all layers of the network - void set_contexts() { - for (size_t i = 0; i < num_layers; i++) - layers[i]->set_context(distContext); - } - //! set netphases for all layers in this network - void set_netphases(net_phase phase) { - for (size_t i = 0; i < num_layers; i++) - layers[i]->set_netphase(phase); - } - //! 
print all layers - void print_layers_info() { - for (size_t i = 0; i < num_layers; i++) - layers[i]->print_layer_info(); - } - + bool bias = false, bool dropout = true); + + // update trainable weights after back-prop + void update_weights(optimizer* opt); + + // forward propagation + acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks); + void bprop(); // back propagation + void set_contexts(); // Save the context + void set_netphases(net_phase phase); // current phase: train or test + void print_layers_info(); // print layer information + void print_configs(); // print the configurations + // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks, float_t* preds, diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp new file mode 100644 index 0000000000..75724a134d --- /dev/null +++ b/libdeepgalois/src/Train.cpp @@ -0,0 +1,509 @@ +#include "galois/Galois.h" +#include "deepgalois/Net.h" + +namespace deepgalois { + +Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, int val_itv) : +// globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), +// globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), +// globalValBegin(0), globalValEnd(0), globalValCount(0), +// globalTestBegin(0), globalTestEnd(0), globalTestCount(0), +// globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) {} + is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + // init some identifiers for this host +#ifndef GALOIS_ENABLE_GPU + this->myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + this->header = "[" + std::to_string(myID) + "] "; + this->seperator = " "; + + assert(n_conv > 0); + + this->num_layers = num_conv_layers + 1; + + // additional layers to add + if (has_l2norm) + this->num_layers++; + if (has_dense) + this->num_layers++; + // initialize feature metadata + feature_dims.resize(num_layers + 1); + print_configs(); + + // initialze global graph context + graphTopologyContext = new deepgalois::Context(); + graphTopologyContext->set_dataset(dataset_str); + // read *entire* graph, get num nodes + globalSamples = graphTopologyContext->read_graph(selfloop); + + // get training and validation sets: this is to create the training + // subgraph in the sampler + globalTrainMasks = new mask_t[globalSamples]; + globalValMasks = new mask_t[globalSamples]; + globalTestMasks = new mask_t[globalSamples]; + std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); + std::fill(globalValMasks, globalValMasks + globalSamples, 0); + + // reddit is hard coded + if (dataset_str == "reddit") { + this->globalTrainBegin = 0; + this->globalTrainCount = 153431; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; + + // TODO do all can be used below + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) + globalTrainMasks[i] = 1; + for (size_t i = globalValBegin; i < 
globalValEnd; i++) + globalValMasks[i] = 1; + } else { + globalTrainCount = graphTopologyContext->read_masks( + "train", globalSamples, globalTrainBegin, globalTrainEnd, + globalTrainMasks); + globalValCount = graphTopologyContext->read_masks( + "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); + } + // make sure sampel size isn't greater than what we have to train with + assert(subgraph_sample_size <= globalTrainCount); + + layers.resize(num_layers); + // hidden1 level embedding: 16 + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = this->h1; + + // features are read in distcontext, not this context (this context only + // used for sampling) + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); +} + +void Net::train(optimizer* opt, bool need_validate) { + double total_train_time = 0.0; + int num_subg_remain = 0; + + if (subgraph_sample_size) { + distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); + allocateSubgraphsMasks(num_subgraphs); + std::cout << header + << "Constructing training vertex set induced graph...\n"; + // auto gg = distContext->getGraphPointer(); + auto gg = + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, + distContext->getGraphPointer()); + } + + //galois::gPrint(header, "Start training...\n"); + + Timer t_epoch; + + // run epochs + for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { + t_epoch.Start(); + + //////////////////////////////////////////////////////////////////////////////// + // Sampling + //////////////////////////////////////////////////////////////////////////////// + if (subgraph_sample_size) { + if (num_subg_remain == 0) { + std::cout << header << "Generating " << num_subgraphs + << " subgraph(s)\n"; + // TODO stat timer instead of this timer + Timer t_subgen; + t_subgen.Start(); + + // generate subgraphs + for (int sid = 0; sid < num_subgraphs; sid++) { + VertexSet sampledSet; + sampler->selectVertices(subgraph_sample_size, sampledSet, + curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + subgraphs_masks + sid * globalSamples, + distContext->getSubgraphPointer(sid)); + } + num_subg_remain = num_subgraphs; + t_subgen.Stop(); + // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + } + // count their degrees + for (int i = 0; i < num_subgraphs; i++) { + auto sg_ptr = distContext->getSubgraphPointer(i); + sg_ptr->degree_counting(); + // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " + // num_e ", sg_ptr->sizeEdges(), "\n"); + } + + // choose a subgraph to use + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); + + //std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; + for (size_t i = 0; i < num_layers; i++) { + layers[i]->update_dim_size(this->subgraphNumVertices); + } + + // TODO dist version where i need global degrees + // change normalization constants + distContext->constructNormFactorSub(sg_id); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(subgraphPointer); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); + } + + // update labels for subgraph + distContext->constructSubgraphLabels( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[num_layers - 
1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); + + // update features for subgraph + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data + + // Graph* testing = distContext->getSubgraphPointer(sg_id); + // for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) + // { + // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); + // } + //} + } // end subgraph sample loop + //////////////////////////////////////////////////////////////////////////////// + + // training steps +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; +#else + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); +#endif + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + + //galois::gPrint(header, "Calling into eval for forward propagation\n"); + // forward: after this phase, layer edges will contain intermediate + // features for use during backprop + double fw_time = evaluate("train", train_loss, train_acc); + //evaluate("train", train_loss, train_acc); + + + //galois::gPrint(header, "Calling into backward propagation\n"); + // backward: use intermediate features + ground truth to update layers + // with feature gradients whcih are then used to calculate weight + // gradients + Net::bprop(); + + //galois::gPrint(header, "Weight update call\n"); + // gradient update: use gradients stored on each layer to update model + // for next epoch + Net::update_weights(opt); // update parameters + + // validation / testing + set_netphases(net_phase::test); + +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << " "; +#else + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, "\n"); +#endif + t_epoch.Stop(); + + double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; + + if (need_validate && curEpoch % val_interval == 0) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + double val_time = evaluate("val", val_loss, val_acc); +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << " "; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; +#else + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, "\n"); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); +#endif + } else { +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; +#else + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); +#endif + } + } // epoch loop + + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; +#ifdef GALOIS_ENABLE_GPU + std::cout << "Average training time per epoch: " << avg_train_time + << "ms. 
Throughput " << throughput << " epoch/s\n"; +#else + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); +#endif +} + +// evaluate, i.e. inference or predict +double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { + Timer t_eval; + t_eval.Start(); + size_t gBegin = 0, gEnd = 0, gCount = 0; + mask_t* gMasks = NULL; + + // TODO global here good for dist case? + if (type == "train") { + gBegin = globalTrainBegin; + gEnd = globalTrainEnd; + gCount = globalTrainCount; + gMasks = globalTrainMasks; + if (subgraph_sample_size) { + // update gMasks for subgraph + gMasks = NULL; + gBegin = 0; + gEnd = this->subgraphNumVertices; + gCount = this->subgraphNumVertices; + } + } else if (type == "val") { + gBegin = globalValBegin; + gEnd = globalValEnd; + gCount = globalValCount; + gMasks = globalValMasks; + } else { + gBegin = globalTestBegin; + gEnd = globalTestEnd; + gCount = globalTestCount; + gMasks = globalTestMasks; + } + + // switch to the original graph if not training + if (subgraph_sample_size && type != "train") { + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(distNumSamples); + for (size_t i = 0; i < num_conv_layers; i++) { +#ifdef GALOIS_ENABLE_GPU + layers[i]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[i]->set_graph_ptr(distContext->getLGraphPointer()); +#endif + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + } + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); + layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data + } +#ifdef GALOIS_ENABLE_GPU + if (type == "train") { + gMasks = d_train_masks; + } else if (type == "val") { + gMasks = d_val_masks; + } else { + gMasks = d_test_masks; + } +#endif + + //galois::gPrint(header, "Doing actual forward propagation\n"); + loss = fprop(gBegin, gEnd, gCount, gMasks); + //galois::gPrint(header, + // "Forward propagation donne, going to check accuracy\n"); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + + // labels will be subgraph labels if applicable + label_t* localLabels; + if (type == "train" && subgraph_sample_size) { + localLabels = distContext->get_labels_subg_ptr(); + } else { + // note this grabs local labels + localLabels = distContext->get_labels_ptr(); + } + + if (is_single_class) { + acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); + } else { + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, + predictions, localLabels); + } + + t_eval.Stop(); + return t_eval.Millisecs(); +} + +void Net::construct_layers() { + // append conv layers + //galois::gPrint(header, "Constructing layers...\n"); + for (size_t i = 0; i < num_conv_layers - 1; i++) { + append_conv_layer(i, true); // conv layers, act=true + } + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false + + if (has_l2norm) { + append_l2norm_layer(num_conv_layers); // l2_norm layer + } + if (has_dense) { + append_dense_layer(num_layers - 2); // dense layer + } + append_out_layer(num_layers - 1); // output layer + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + layers[i]->add_edge(); + } + for (size_t i = 1; i < num_layers; i++) { + connect(layers[i - 1], layers[i]); + } + for (size_t i = 0; i < num_layers; i++) { + layers[i]->malloc_and_init(); + } + + layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data + // precompute the 
normalization constant based on graph structure + // context->norm_factor_computing(false); + distContext->constructNormFactor(graphTopologyContext); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + set_contexts(); +} + +//! Add an l2_norm layer to the network +void Net::append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); +} + +//! Add an dense layer to the network +void Net::append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); +} + +//! Add an output layer to the network +void Net::append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + + layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); +} +//! Add a convolution layer to the network +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, bool dropout) { + assert(dropout_rate < 1.0); + assert(layer_id < num_conv_layers); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); +#ifdef GALOIS_ENABLE_GPU + layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); +#endif +} + +//! forward propagation: [begin, end) is the range of samples used. +//! 
calls "forward" on each layer and returns the loss of the final layer +acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { + // set mask for the last layer; globals + // TODO this should be distirbuted sample gBegin->end not global; fix later + // seems to be unused in code right now anyways + //galois::gPrint(header, "fprop: set sample mask\n"); + layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); + + for (size_t i = 0; i < num_layers; i++) { + //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + layers[i]->forward(); + } + + //galois::gPrint(header, "fprop: getting loss\n"); + // prediction error + acc_t loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + return loss; +} + +// back propagation +void Net::bprop() { + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } +} + +// update trainable weights after back-propagation +void Net::update_weights(optimizer* opt) { + regularize(); + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } +} + +//! Save the context object to all layers of the network +void Net::set_contexts() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_context(distContext); +} + +//! set netphases for all layers in this network +void Net::set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_netphase(phase); +} + +//! print all layers +void Net::print_layers_info() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->print_layer_info(); +} + +// print the configurations +void Net::print_configs() { + galois::gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); +} + +} // end namespace From bd6e0fbecf08a4399c70701018d9d15b0d84540f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 2 Jun 2020 09:16:16 -0500 Subject: [PATCH 319/660] clean Net.h --- libdeepgalois/include/deepgalois/Net.h | 9 +-------- libdeepgalois/src/Train.cpp | 19 +++++++++---------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 6c720f730d..81754f915a 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -22,14 +22,7 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { -#ifdef GALOIS_ENABLE_GPU - unsigned myID = 0; -#else - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; -#endif - std::string header = "[" + std::to_string(myID) + "] "; - std::string seperator = "\n"; - + std::string header; bool is_single_class; // single-class (one-hot) or multi-class label bool has_l2norm; // whether the net contains an l2_norm layer bool has_dense; // whether the net contains an dense layer diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 75724a134d..7bd0b70385 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -18,14 +18,12 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, h1(hidden1), learning_rate(lr), dropout_rate(dropout), 
weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host -#ifndef GALOIS_ENABLE_GPU - this->myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = 0; +#ifdef GALOIS_ENABLE_DIST + myID = galois::runtime::getSystemNetworkInterface().ID; #endif - this->header = "[" + std::to_string(myID) + "] "; - this->seperator = " "; - + this->header = "[" + std::to_string(myID) + "] "; assert(n_conv > 0); - this->num_layers = num_conv_layers + 1; // additional layers to add @@ -87,6 +85,7 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, } void Net::train(optimizer* opt, bool need_validate) { + std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -102,7 +101,7 @@ void Net::train(optimizer* opt, bool need_validate) { distContext->getGraphPointer()); } - //galois::gPrint(header, "Start training...\n"); + galois::gPrint(header, "Start training...\n"); Timer t_epoch; @@ -189,7 +188,7 @@ void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_ENABLE_GPU std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, separator); #endif set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -220,7 +219,7 @@ void Net::train(optimizer* opt, bool need_validate) { << train_loss << " train_acc " << train_acc << " "; #else galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, "\n"); + train_loss, " train_acc ", train_acc, separator); #endif t_epoch.Stop(); @@ -239,7 +238,7 @@ void Net::train(optimizer* opt, bool need_validate) { << " val_time " << val_time << ")\n"; #else galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, "\n"); + val_loss, " val_acc ", val_acc, separator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); From bb4179b3df37855b21764eeb5ceb04c76f0e6e16 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Jun 2020 14:07:02 -0500 Subject: [PATCH 320/660] timer set 1 --- libdeepgalois/src/Train.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 7bd0b70385..284044cd4c 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -90,6 +90,8 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; if (subgraph_sample_size) { + galois::StatTimer construct_time("SubgraphAllocateTime"); + construct_time.start(); distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); allocateSubgraphsMasks(num_subgraphs); std::cout << header @@ -99,6 +101,7 @@ void Net::train(optimizer* opt, bool need_validate) { graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, distContext->getGraphPointer()); + construct_time.stop(); } galois::gPrint(header, "Start training...\n"); @@ -113,12 +116,13 @@ void Net::train(optimizer* opt, bool need_validate) { // Sampling //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { + galois::StatTimer sample_time("SubgraphSampleTime"); + sample_time.start(); if (num_subg_remain 
== 0) { std::cout << header << "Generating " << num_subgraphs << " subgraph(s)\n"; - // TODO stat timer instead of this timer - Timer t_subgen; - t_subgen.Start(); + galois::StatTimer t_subgen("SubgraphGenerateTime"); + t_subgen.start(); // generate subgraphs for (int sid = 0; sid < num_subgraphs; sid++) { @@ -130,8 +134,7 @@ void Net::train(optimizer* opt, bool need_validate) { distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; - t_subgen.Stop(); - // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + t_subgen.stop(); } // count their degrees for (int i = 0; i < num_subgraphs; i++) { @@ -181,6 +184,7 @@ void Net::train(optimizer* opt, bool need_validate) { // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); // } //} + sample_time.stop(); } // end subgraph sample loop //////////////////////////////////////////////////////////////////////////////// @@ -267,8 +271,13 @@ void Net::train(optimizer* opt, bool need_validate) { // evaluate, i.e. inference or predict double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { + // TODO get rid of this timer Timer t_eval; t_eval.Start(); + + galois::StatTimer eval_timer("EvaluateTime"); + eval_timer.start(); + size_t gBegin = 0, gEnd = 0, gCount = 0; mask_t* gMasks = NULL; @@ -345,6 +354,9 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { predictions, localLabels); } + eval_timer.stop(); + + // TODO replace with stat timer t_eval.Stop(); return t_eval.Millisecs(); } @@ -442,6 +454,8 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, boo //! forward propagation: [begin, end) is the range of samples used. //! calls "forward" on each layer and returns the loss of the final layer acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { + galois::StatTimer fprop_timer("ForwardPropTime"); + fprop_timer.start(); // set mask for the last layer; globals // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways @@ -458,14 +472,18 @@ acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting loss += weight_decay * layers[0]->get_weight_decay_loss(); + fprop_timer.stop(); return loss; } // back propagation void Net::bprop() { + galois::StatTimer bprop_timer("BackPropTime"); + bprop_timer.start(); for (size_t i = num_layers; i != 0; i--) { layers[i - 1]->backward(); } + bprop_timer.stop(); } // update trainable weights after back-propagation From 8f37a6d8754646a4ab0d1bcacb77c375fbf89bc3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Jun 2020 15:29:02 -0500 Subject: [PATCH 321/660] timers set 2 --- libdeepgalois/src/Train.cpp | 3 +++ libdeepgalois/src/layers/graph_conv_layer.cpp | 12 +++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 284044cd4c..4e363bb1b1 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -85,6 +85,8 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, } void Net::train(optimizer* opt, bool need_validate) { + galois::StatTimer train_timer("Timer_0"); + train_timer.start(); std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -267,6 +269,7 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "Average training time per epoch: ", 
avg_train_time, " ms. Throughput: ", throughput, " epoch/s\n"); #endif + train_timer.stop(); } // evaluate, i.e. inference or predict diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 941a796a81..2a0eb05d67 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -92,6 +92,8 @@ void graph_conv_layer::malloc_and_init() { // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; @@ -121,17 +123,20 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync( - "AggSync"); + "GraphConvForward"); // run relu activation on output if specified if (act_) math::relu_cpu(x * z, out_data, out_data); + conv_timer.stop(); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + galois::StatTimer conv_timer("GraphConvBackward"); + conv_timer.start(); size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; @@ -167,13 +172,14 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; layer::context->getSyncSubstrate()->sync( - "AggSyncBack"); + "GraphConvBackward"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); - layer::syncSub->sync("GradientSync"); + layer::syncSub->sync("Gradients"); galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + conv_timer.stop(); } acc_t graph_conv_layer::get_weight_decay_loss() { From fe13b47cc86a83271282f407ff34bb3a85787f30 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Jun 2020 16:48:43 -0500 Subject: [PATCH 322/660] timer set 3 (graph conv breakdown) --- libdeepgalois/src/layers/graph_conv_layer.cpp | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 2a0eb05d67..b2fe0784f7 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -34,22 +34,28 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, // aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + galois::StatTimer aggregate_timer("AggregateTime"); + aggregate_timer.start(); // normalization constant based on graph structure #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); #else update_all(len, g, in, out, norm_, norm_consts); #endif + aggregate_timer.stop(); } // since graph is symmetric, the derivative is the same void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + galois::StatTimer aggregate_timer("AggregateDerivativeTime"); + aggregate_timer.start(); #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #else update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #endif + aggregate_timer.stop(); } void graph_conv_layer::combine(size_t n, size_t len, const 
float_t* self, @@ -98,6 +104,8 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; + galois::StatTimer drop_timer("GraphConvForwardDropout"); + drop_timer.start(); // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W @@ -107,7 +115,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, } else { math::copy_cpu(x * y, in_data, in_temp); } + drop_timer.stop(); + galois::StatTimer compute_timer("GraphConvForwardCompute"); + compute_timer.start(); if (y > z) { math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); @@ -117,6 +128,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); } + compute_timer.stop(); // TODO sync of out_data required here // TODO how to do this for the sampled case? @@ -126,8 +138,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, "GraphConvForward"); // run relu activation on output if specified + galois::StatTimer relu_timer("GraphConvForwardRelu"); + relu_timer.start(); if (act_) math::relu_cpu(x * z, out_data, out_data); + relu_timer.stop(); + conv_timer.stop(); } @@ -141,10 +157,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? + galois::StatTimer relu_timer("GraphConvBackwardRelu"); + relu_timer.start(); if (act_) math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + relu_timer.stop(); // else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + galois::StatTimer compute_timer("GraphConvBackwardCompute"); + compute_timer.start(); if (y > z) { d_aggregate(z, *graph_cpu, out_grad, out_temp); // at this point, out_temp has the derivative of data from last step to @@ -167,6 +188,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); } + compute_timer.stop(); // sync agg deepgalois::_syncVectorSize = z; @@ -174,8 +196,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, layer::context->getSyncSubstrate()->sync( "GraphConvBackward"); + galois::StatTimer drop_timer("GraphConvBackwardDropout"); + drop_timer.start(); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); + drop_timer.stop(); layer::syncSub->sync("Gradients"); galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); From 90cc74711647b4c8fd9bad5b181e8bf2ae234d3e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 10:30:48 -0500 Subject: [PATCH 323/660] clang-format 10 run --- .../include/deepgalois/DistContext.h | 43 ++++--- libdeepgalois/include/deepgalois/GraphTypes.h | 10 +- libdeepgalois/include/deepgalois/Net.h | 24 ++-- libdeepgalois/include/deepgalois/Sampler.h | 26 ++-- .../include/deepgalois/layers/layer.h | 13 +- libdeepgalois/include/deepgalois/lgraph.h | 8 +- libdeepgalois/src/DistContext.cpp | 4 +- libdeepgalois/src/RandomWalk.cpp | 59 +++++---- libdeepgalois/src/Sampler.cpp | 84 +++++++------ libdeepgalois/src/Train.cpp | 114 +++++++++--------- libdeepgalois/src/layers/graph_conv_layer.cpp | 4 +- libdeepgalois/src/lgraph.cpp | 10 +- libdeepgalois/src/reader.cpp | 24 ++-- 
libdeepgalois/src/utils.cpp | 4 +- 14 files changed, 239 insertions(+), 188 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index c614a92ca2..3ecf9ed411 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -18,13 +18,14 @@ class DistContext { bool is_selfloop_added; // whether selfloop is added to the input graph bool usingSingleClass; std::string dataset; - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - Graph* lGraph; // learning graph version + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + Graph* lGraph; // learning graph version DGraph* partitionedGraph; // the input graph, |V| = N std::vector partitionedSubgraphs; - label_t* h_labels; // labels for classification. Single-class: Nx1, multi-class: NxE - float_t* h_feats; // input features: N x D + label_t* h_labels; // labels for classification. Single-class: Nx1, + // multi-class: NxE + float_t* h_feats; // input features: N x D #ifdef GALOIS_ENABLE_GPU label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device @@ -35,9 +36,10 @@ class DistContext { #else galois::graphs::GluonSubstrate* syncSubstrate; #endif - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector normFactors; // normalization constant based on graph structure + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector + normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph Reader reader; @@ -45,10 +47,10 @@ class DistContext { public: // TODO better constructor DistContext(); - DistContext(bool isDevice) : is_device(isDevice), is_selfloop_added(false), - usingSingleClass(true), dataset(""), - num_classes(0), feat_len(0), lGraph(NULL), - partitionedGraph(NULL), h_labels(0), h_feats(0) {} + DistContext(bool isDevice) + : is_device(isDevice), is_selfloop_added(false), usingSingleClass(true), + dataset(""), num_classes(0), feat_len(0), lGraph(NULL), + partitionedGraph(NULL), h_labels(0), h_feats(0) {} ~DistContext(); size_t read_graph(std::string dataset_str, bool selfloop = false); @@ -75,15 +77,20 @@ class DistContext { label_t* get_labels_subg_ptr() { return d_labels_subg; } float_t* get_norm_factors_ptr() { return d_normFactors; } float_t* get_norm_factors_subg_ptr() { return d_normFactorsSub; } - void copy_data_to_device(); // copy labels and input features - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + void copy_data_to_device(); // copy labels and input features + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t 
curand_generator() { return curand_generator_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } #else void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 1528375290..3f613a3039 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -17,11 +17,11 @@ namespace deepgalois { using edge_iterator = index_t; using GraphCPU = LearningGraph; #ifdef GALOIS_ENABLE_GPU -using DGraph = CSRGraph; -using Graph = CSRGraph; -using GraphGPU = CSRGraph; +using DGraph = CSRGraph; +using Graph = CSRGraph; +using GraphGPU = CSRGraph; #else -using DGraph = galois::graphs::DistGraph; -using Graph = LearningGraph; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; #endif } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 81754f915a..bd33924eee 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -85,20 +85,22 @@ class Net { public: //! Default net constructor - Net() : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, - false, true, false, false, 25, 9000, 1) {} + Net() + : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, false, true, false, false, + 25, 9000, 1) {} //! Net constructor Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv); + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv); // allocate memory for subgraph masks void allocateSubgraphsMasks(int num_subgraphs); //! Initializes metadata for the partition: loads data, labels, etc - void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } void regularize(); // add weight decay @@ -131,12 +133,12 @@ class Net { // forward propagation acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks); - void bprop(); // back propagation - void set_contexts(); // Save the context + void bprop(); // back propagation + void set_contexts(); // Save the context void set_netphases(net_phase phase); // current phase: train or test - void print_layers_info(); // print layer information - void print_configs(); // print the configurations - + void print_layers_info(); // print layer information + void print_configs(); // print the configurations + // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks, float_t* preds, diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 1b5754f394..ff1b460b10 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -34,25 +34,29 @@ class Sampler { //! 
Reindex a graph to only contain those in the vertex set void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); - //! Given a graph, return a graph with edges to unmasked vertices removed in mg + //! Given a graph, return a graph with edges to unmasked vertices removed in + //! mg template void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); //! determine degree of each vertex in a masked graph (given by masks and g) template - void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees); //! Set masks bitset with IDs in the vertices VertexSet - //void createMasks(size_t n, VertexSet vertices, mask_t* masks); - //inline VertexList reindexVertices(size_t n, VertexSet vertex_set); - //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); + // void createMasks(size_t n, VertexSet vertices, mask_t* masks); + // inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + // void checkGSDB(std::vector& DB0, std::vector& DB1, + // std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); void createMasks(size_t n, VertexSet vertices, mask_t* masks) { std::fill(masks, masks + n, 0); - for (auto v : vertices) masks[v] = 1; + for (auto v : vertices) + masks[v] = 1; } //! helper function to get degree of some vertex given some graph @@ -71,7 +75,7 @@ class Sampler { // helper function for graph saint implementation below void checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, index_t size) { + std::vector& DB2, index_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); DB1.reserve(DB1.capacity() * 2); @@ -88,11 +92,12 @@ class Sampler { //! sample a subgraph sg of size n from graph g //! sg is overwritten/is output - void generateSubgraph(VertexSet &vertex_set, mask_t* masks, Graph* sg); + void generateSubgraph(VertexSet& vertex_set, mask_t* masks, Graph* sg); //! API function for user-defined selection strategy // TODO how to expose this? - void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); + void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, + VertexSet& vertex_set); virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); // galois::runtime::iterable > @@ -100,7 +105,8 @@ class Sampler { //! Given a mask, construct the graph with only those vertices ans ave as the //! masked graph in this class for the sampler. 
- void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg); + void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 02b5abebb4..874e7d41c6 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -41,11 +41,11 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - #ifndef GALOIS_ENABLE_GPU +#ifndef GALOIS_ENABLE_GPU const std::string header = "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + "] "; - #endif +#endif unsigned level_; // layer id: [0, num_layers-1] size_t begin_; // sample begin index size_t end_; // sample end index @@ -92,10 +92,11 @@ class layer : public deepgalois::node { #ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif - std::cout << "[" << myID << "] Layer " << level_ << " type: " << layer_type() - << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; - //galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + std::cout << "[" << myID << "] Layer " << level_ + << " type: " << layer_type() << "input[" << input_dims[0] << "," + << input_dims[1] << "] output[" << output_dims[0] << "," + << output_dims[1] << "]\n"; + // galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), // "input[", input_dims[0], ",", input_dims[1], "] output[", // output_dims[0], ",", output_dims[1], "]\n"); } diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 2e086ebf88..01b84a60b6 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -35,8 +35,7 @@ class LearningGraph { public: typedef size_t iterator; LearningGraph(bool use_gpu) - : is_device(use_gpu), max_size_(0), - num_vertices_(0), num_edges_(0), + : is_device(use_gpu), max_size_(0), num_vertices_(0), num_edges_(0), vertex_data_(NULL), edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } @@ -57,7 +56,10 @@ class LearningGraph { void dealloc(); void degree_counting(); void constructNodes() {} - void set_max_size(index_t max) { assert(max>0); max_size_ = max; } + void set_max_size(index_t max) { + assert(max > 0); + max_size_ = max; + } void readGraph(std::string dataset, bool selfloop = false); void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 4a9087b0b3..e9f0ef4214 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,9 +3,7 @@ #include "deepgalois/configs.h" namespace deepgalois { -DistContext::DistContext() : DistContext(false) { - syncSubstrate = NULL; -} +DistContext::DistContext() : DistContext(false) { syncSubstrate = NULL; } DistContext::~DistContext() {} diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp index cf2112ca60..23efe124d2 100644 --- a/libdeepgalois/src/RandomWalk.cpp +++ b/libdeepgalois/src/RandomWalk.cpp @@ -7,7 +7,8 @@ namespace deepgalois { -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg) { this->count_ 
= count; // save original graph Sampler::globalGraph = g; @@ -20,18 +21,23 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG std::vector degrees(g->size(), 0); galois::gPrint("graph size: ", g->size(), "\n"); // get degrees of nodes that will be in new graph - //this->getMaskedDegrees(g->size(), masks, g, degrees); - galois::do_all(galois::iterate(size_t(0), g->size()), [&](const auto src) { - if (masks[src] == 1) { - for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { - const auto dst = g->getEdgeDstHost(e); - if (masks[dst] == 1) degrees[src]++; - } - } - } , galois::loopname("update_degrees")); + // this->getMaskedDegrees(g->size(), masks, g, degrees); + galois::do_all( + galois::iterate(size_t(0), g->size()), + [&](const auto src) { + if (masks[src] == 1) { + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) + degrees[src]++; + } + } + }, + galois::loopname("update_degrees")); auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto ne = offsets[g->size()]; + auto ne = offsets[g->size()]; // save ids (of original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { @@ -42,19 +48,23 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); Sampler::globalMaskedGraph->constructNodes(); // same as original graph, except keep only edges involved in masks - galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { - Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { - const auto dst = g->getEdgeDstHost(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + galois::do_all( + galois::iterate((size_t)0, g->size()), + [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } + } } - } - } - }, galois::loopname("gen_subgraph")); + }, + galois::loopname("gen_subgraph")); Sampler::globalMaskedGraph->degree_counting(); Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); @@ -67,7 +77,8 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { - if (n < m) m = n; + if (n < m) + m = n; unsigned myseed = seed; // unsigned myseed = tid; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 36b697ecb6..055b5c0a85 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -186,7 +186,7 @@ void Sampler::selectVertices(index_t nv, index_t n, Graph* g, vertex_set.insert(frontier.begin(), frontier.end()); // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int* degrees = new int[m]; - //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + // 
galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { for (index_t i = 0; i < m; i++) { degrees[i] = (int)getDegree(g, frontier[i]); } //, galois::loopname("compute_degrees")); @@ -217,7 +217,8 @@ void Sampler::selectVertices(index_t nv, index_t n, Graph* g, // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, + Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); @@ -235,7 +236,9 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all(galois::iterate(size_t(0), size_t(nv)), [&](const auto i) { + galois::do_all( + galois::iterate(size_t(0), size_t(nv)), + [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif @@ -251,7 +254,8 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& } } #ifdef PARALLEL_GEN - , galois::loopname("construct_graph")); + , + galois::loopname("construct_graph")); #endif } @@ -267,53 +271,64 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { } template -void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { -//template <> -//void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, std::vector& degrees) { +void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + // template <> + // void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, + // std::vector& degrees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - //for (size_t src = 0; src < n; src++) { - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { + // for (size_t src = 0; src < n; src++) { + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } } - } - } - } , galois::loopname("update_degrees")); + }, + galois::loopname("update_degrees")); } template -void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub) { +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, + SubgraphTy* sub) { std::vector degrees(n, 0); this->getMaskedDegrees(n, masks, g, degrees); // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; - // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, "\n"); + // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, + // "\n"); // note this constructs the full graph's nodes; just trims edges sub->allocateFrom(n, ne); sub->constructNodes(); - galois::do_all(galois::iterate(size_t(0), size_t(n)), [&](const auto src) { - sub->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = 
g->edge_begin(src); e != g->edge_end(src); e++) { - auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - sub->constructEdge(idx++, dst, 0); + galois::do_all( + galois::iterate(size_t(0), size_t(n)), + [&](const auto src) { + sub->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + sub->constructEdge(idx++, dst, 0); + } + } } - } - } - }, galois::loopname("gen_subgraph")); + }, + galois::loopname("gen_subgraph")); } -void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) { +void Sampler::generateSubgraph(VertexSet& sampledSet, mask_t* masks, + Graph* sg) { // n = 9000 by default // do the sampling of vertices from training set + using masked graph @@ -335,7 +350,8 @@ void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) Graph maskedSG; // TODO use partMaskedGraph once constructed later // remove edges whose destination is not masked - this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, &maskedSG); + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + &maskedSG); this->reindexSubgraph(sampledLIDs, maskedSG, *sg); // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 4e363bb1b1..992902e7b6 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -4,20 +4,21 @@ namespace deepgalois { Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv) : -// globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), -// globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), -// globalValBegin(0), globalValEnd(0), globalValCount(0), -// globalTestBegin(0), globalTestEnd(0), globalTestCount(0), -// globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) {} + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, + unsigned subg_sz, int val_itv) + : // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), + // globalValBegin(0), globalValEnd(0), globalValCount(0), + // globalTestBegin(0), globalTestEnd(0), globalTestCount(0), + // globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) + // {} is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), - num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), h1(hidden1), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { - // init some identifiers for this host + // init some identifiers for this host unsigned myID = 0; #ifdef GALOIS_ENABLE_DIST myID = galois::runtime::getSystemNetworkInterface().ID; @@ -87,7 +88,7 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer 
train_timer("Timer_0"); train_timer.start(); - std::string separator = "\n"; + std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -97,12 +98,12 @@ void Net::train(optimizer* opt, bool need_validate) { distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); allocateSubgraphsMasks(num_subgraphs); std::cout << header - << "Constructing training vertex set induced graph...\n"; + << "Constructing training vertex set induced graph...\n"; // auto gg = distContext->getGraphPointer(); auto gg = - graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, - distContext->getGraphPointer()); + distContext->getGraphPointer()); construct_time.stop(); } @@ -122,7 +123,7 @@ void Net::train(optimizer* opt, bool need_validate) { sample_time.start(); if (num_subg_remain == 0) { std::cout << header << "Generating " << num_subgraphs - << " subgraph(s)\n"; + << " subgraph(s)\n"; galois::StatTimer t_subgen("SubgraphGenerateTime"); t_subgen.start(); @@ -130,10 +131,10 @@ void Net::train(optimizer* opt, bool need_validate) { for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; sampler->selectVertices(subgraph_sample_size, sampledSet, - curEpoch); // m = 1000 by default + curEpoch); // m = 1000 by default sampler->generateSubgraph(sampledSet, - subgraphs_masks + sid * globalSamples, - distContext->getSubgraphPointer(sid)); + subgraphs_masks + sid * globalSamples, + distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; t_subgen.stop(); @@ -152,7 +153,7 @@ void Net::train(optimizer* opt, bool need_validate) { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - //std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // std::cout << "Subgraph num_vertices: " << subgraphNumVertices // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); @@ -199,20 +200,19 @@ void Net::train(optimizer* opt, bool need_validate) { set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; - //galois::gPrint(header, "Calling into eval for forward propagation\n"); + // galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); - //evaluate("train", train_loss, train_acc); + // evaluate("train", train_loss, train_acc); - - //galois::gPrint(header, "Calling into backward propagation\n"); + // galois::gPrint(header, "Calling into backward propagation\n"); // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients Net::bprop(); - //galois::gPrint(header, "Weight update call\n"); + // galois::gPrint(header, "Weight update call\n"); // gradient update: use gradients stored on each layer to update model // for next epoch Net::update_weights(opt); // update parameters @@ -222,10 +222,10 @@ void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_ENABLE_GPU std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << " "; + << train_loss << " train_acc " << train_acc << " "; #else 
galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, separator); + train_loss, " train_acc ", train_acc, separator); #endif t_epoch.Stop(); @@ -238,24 +238,25 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate("val", val_loss, val_acc); #ifdef GALOIS_ENABLE_GPU std::cout << header << "val_loss " << std::setprecision(3) << std::fixed - << val_loss << " val_acc " << val_acc << " "; + << val_loss << " val_acc " << val_acc << " "; std::cout << header << "time " << std::setprecision(3) << std::fixed - << epoch_time + val_time << " ms (train_time " << epoch_time - << " val_time " << val_time << ")\n"; + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; #else galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, separator); + val_loss, " val_acc ", val_acc, separator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); #endif } else { #ifdef GALOIS_ENABLE_GPU std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time + << ")\n"; #else - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); + galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms (fw ", + fw_time, ", bw ", epoch_time - fw_time, ")\n"); #endif } } // epoch loop @@ -263,11 +264,11 @@ void Net::train(optimizer* opt, bool need_validate) { double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; #ifdef GALOIS_ENABLE_GPU - std::cout << "Average training time per epoch: " << avg_train_time - << "ms. Throughput " << throughput << " epoch/s\n"; + std::cout << "Average training time per epoch: " << avg_train_time + << "ms. Throughput " << throughput << " epoch/s\n"; #else galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + " ms. 
Throughput: ", throughput, " epoch/s\n"); #endif train_timer.stop(); } @@ -334,9 +335,9 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { } #endif - //galois::gPrint(header, "Doing actual forward propagation\n"); + // galois::gPrint(header, "Doing actual forward propagation\n"); loss = fprop(gBegin, gEnd, gCount, gMasks); - //galois::gPrint(header, + // galois::gPrint(header, // "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); @@ -350,11 +351,11 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { } if (is_single_class) { - acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, - localLabels); + acc = + masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, localLabels); } else { - acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, - predictions, localLabels); + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); } eval_timer.stop(); @@ -366,7 +367,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { void Net::construct_layers() { // append conv layers - //galois::gPrint(header, "Constructing layers...\n"); + // galois::gPrint(header, "Constructing layers...\n"); for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } @@ -438,7 +439,8 @@ void Net::append_out_layer(size_t layer_id) { layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); } //! Add a convolution layer to the network -void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, bool dropout) { +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, + bool dropout) { assert(dropout_rate < 1.0); assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); @@ -446,7 +448,7 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, boo in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); + dropout_rate, in_dims, out_dims); #ifdef GALOIS_ENABLE_GPU layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); #else @@ -462,15 +464,15 @@ acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { // set mask for the last layer; globals // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways - //galois::gPrint(header, "fprop: set sample mask\n"); + // galois::gPrint(header, "fprop: set sample mask\n"); layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { - //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + // galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } - //galois::gPrint(header, "fprop: getting loss\n"); + // galois::gPrint(header, "fprop: getting loss\n"); // prediction error acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting @@ -520,10 +522,10 @@ void Net::print_layers_info() { // print the configurations void Net::print_configs() { galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + ", 
num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); } -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b2fe0784f7..4c11086495 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -135,7 +135,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync( - "GraphConvForward"); + "GraphConvForward"); // run relu activation on output if specified galois::StatTimer relu_timer("GraphConvForwardRelu"); @@ -194,7 +194,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; layer::context->getSyncSubstrate()->sync( - "GraphConvBackward"); + "GraphConvBackward"); galois::StatTimer drop_timer("GraphConvBackwardDropout"); drop_timer.start(); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index c0c39b4023..31cd353e51 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -21,7 +21,8 @@ uint64_t LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } void LearningGraph::readGraph(std::string dataset, bool selfloop) { - if (selfloop) std::cout << "selfloop not yet implemented\n"; + if (selfloop) + std::cout << "selfloop not yet implemented\n"; deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } @@ -29,9 +30,10 @@ void LearningGraph::readGraph(std::string dataset, bool selfloop) { void LearningGraph::degree_counting() { // if (degrees_ != NULL) return; // degrees_ = new index_t[num_vertices_]; - galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), - [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, - galois::loopname("DegreeCounting")); + galois::do_all( + galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); } void LearningGraph::dealloc() {} diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 016a72d26a..bf5792fca4 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -31,14 +31,17 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { size_t m, num_classes; // m: number of samples in >> m >> num_classes >> std::ws; if (is_single_class) { - std::cout << "[" << myID << "] Reader: Using single-class (one-hot) labels\n"; - //galois::gPrint("[", myID, + std::cout << "[" << myID + << "] Reader: Using single-class (one-hot) labels\n"; + // galois::gPrint("[", myID, // "] Reader: Using single-class (one-hot) labels\n"); labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { - //galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); - std::cout << "[" << myID << "] Reader: Using multi-class (one-hot) labels\n"; + // galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) + // labels\n"); + std::cout << "[" << myID + << "] Reader: Using multi-class (one-hot) labels\n"; labels = new label_t[m * num_classes]; // multi-class label for each vertex: N x E @@ -65,8 +68,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { 
// print the number of vertex classes std::cout << "[" << myID << "] Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; - //galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, - //", time: ", t_read.Millisecs(), " ms\n"); + // galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + //", time: ", t_read.Millisecs(), " ms\n"); // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -158,10 +161,11 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - std::cout << "Global read " << mask_type << "_mask range: [" << begin - << ", " << end << ") Number of valid samples: " << sample_count - << " (" << (float)sample_count / (float)n * (float)100 << "\%)\n"; - //galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, + std::cout << "Global read " << mask_type << "_mask range: [" << begin << ", " + << end << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; + // galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", + // end, // ") Number of valid samples: ", sample_count, " (", // (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 10cd18832c..929881dd25 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -26,7 +26,7 @@ OutTy* parallel_prefix_sum(const std::vector& in) { } bulk_prefix[num_blocks] = total; // TODO do not use new here: difficult to track and free later - OutTy* prefix = new OutTy[in.size() + 1]; + OutTy* prefix = new OutTy[in.size() + 1]; galois::do_all( galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { OutTy local_total = bulk_prefix[block]; @@ -120,7 +120,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, #endif std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; - //galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + // galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, // " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); return f1_micro; From 1af2982fac79a4702155dde306ff0a80b41190b9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 11:39:36 -0500 Subject: [PATCH 324/660] BufferWrapper to wrap memory buffers with a size --- .../layers/GraphConvSyncStructures.h | 16 ++----- libdist/include/galois/BufferWrapper.h | 44 +++++++++++++++++++ 2 files changed, 48 insertions(+), 12 deletions(-) create mode 100644 libdist/include/galois/BufferWrapper.h diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index 95e09b1c0d..b07b672fa1 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -1,23 +1,15 @@ #ifndef GALOIS_ENABLE_GPU #ifndef __GRAPH_CONV_SYNC_STRUCT__ #define __GRAPH_CONV_SYNC_STRUCT__ +#include "galois/BufferWrapper.h" struct GraphConvSync { - using ValTy = std::vector; + using ValTy = galois::BufferWrapper; //! 
return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { - // TODO figure out how to avoid copy from C array to vector; best - // way is if original data is in a vector probably, but that has the - // issue of not being able to directly call BLAS - ValTy vecToReturn; - // allocate space - vecToReturn.resize(deepgalois::_syncVectorSize); - // copy the node's data to vector to serialize/send - for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { - vecToReturn[i] = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; - } + ValTy vecToReturn(&deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], + deepgalois::_syncVectorSize); // move constructor should kick in here to avoid return copy return vecToReturn; } diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h new file mode 100644 index 0000000000..eeebd7f747 --- /dev/null +++ b/libdist/include/galois/BufferWrapper.h @@ -0,0 +1,44 @@ +#ifndef GALOIS_BUFFER_WRAPPER +#define GALOIS_BUFFER_WRAPPER +#include + +namespace galois { + +//! Wraps a pointer representing an array with the number of elements the +//! array contains (or that we want to handle with this class) +//! Used to avoid copying of memory into a vector for +//! serialization/deserialization purpose +//! @todo give this a better name +template +class BufferWrapper { + //! Raw memory kept by this class + ElementType* raw_memory; + //! Number of elements that can be accessed from the raw_memory pointer + size_t num_elements; +public: + //! Default constructor doesn't exist: must provide pointer and size + BufferWrapper() = delete; + //! Save a pointer and the number of elements in that array that this can access + BufferWrapper(ElementType* pointer, size_t num_elements_) : raw_memory(pointer), + num_elements(num_elements_) {}; + + //! Returns element at some specified index of the array + ElementType& operator[](size_t index) { + assert(index < num_elements); + return raw_memory[index]; + } + + //! Returns element at some specified index of the array; const i.e. not modifiable + const ElementType& operator[](size_t index) const { + assert(index < num_elements); + return raw_memory[index]; + } + + //! Return number of elements in the array + size_t size() const { + return this->num_elements; + } +}; + +} // end namespace +#endif From 1f000d8e9855b21ecc88ad1d930d9208ae370700 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 14:33:42 -0500 Subject: [PATCH 325/660] bufferwrapper functionality + serialization of it --- libdist/include/galois/BufferWrapper.h | 67 ++++++++++++++++++---- libdist/include/galois/runtime/Serialize.h | 42 ++++++++++++++ 2 files changed, 97 insertions(+), 12 deletions(-) diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h index eeebd7f747..4c5854d5ad 100644 --- a/libdist/include/galois/BufferWrapper.h +++ b/libdist/include/galois/BufferWrapper.h @@ -1,26 +1,52 @@ #ifndef GALOIS_BUFFER_WRAPPER #define GALOIS_BUFFER_WRAPPER +#include "galois/gstl.h" #include namespace galois { //! Wraps a pointer representing an array with the number of elements the //! array contains (or that we want to handle with this class) +//! //! Used to avoid copying of memory into a vector for //! serialization/deserialization purpose //! @todo give this a better name -template +template class BufferWrapper { - //! Raw memory kept by this class +public: + using size_type = size_t; + using value_type = ElementType; + +private: + //! 
This vector is allocated when creating a buffer wrapper from scratch + //! (i.e. during deserialization into one) + galois::gstl::Vector dummy; + //! Raw memory kept by this class; either points to existing memory or to the + //! vector memory held by this class ElementType* raw_memory; //! Number of elements that can be accessed from the raw_memory pointer - size_t num_elements; + size_type num_elements; + public: - //! Default constructor doesn't exist: must provide pointer and size - BufferWrapper() = delete; - //! Save a pointer and the number of elements in that array that this can access - BufferWrapper(ElementType* pointer, size_t num_elements_) : raw_memory(pointer), - num_elements(num_elements_) {}; + //! Default constructor 0s everything + BufferWrapper() { + this->raw_memory = 0; + this->num_elements = 0; + } + + //! frees dummy vector + ~BufferWrapper() { + // explicit vector clear; regular destructor probably frees it, but + // doing it for safetey + if (dummy.size()) { + dummy.clear(); + } + } + + //! Save a pointer and the number of elements in that array that this can + //! access + BufferWrapper(ElementType* pointer, size_t num_elements_) + : raw_memory(pointer), num_elements(num_elements_){}; //! Returns element at some specified index of the array ElementType& operator[](size_t index) { @@ -28,17 +54,34 @@ class BufferWrapper { return raw_memory[index]; } - //! Returns element at some specified index of the array; const i.e. not modifiable + //! Returns element at some specified index of the array; const i.e. not + //! modifiable const ElementType& operator[](size_t index) const { assert(index < num_elements); return raw_memory[index]; } //! Return number of elements in the array - size_t size() const { - return this->num_elements; + size_t size() const { return this->num_elements; } + + //! return unmodifiable pointer to raw_memory + const ElementType* data() const { return raw_memory; } + //! return pointer to raw_memory + ElementType* data() { return raw_memory; } + + //! Allocates memory in the underlying vector; should only be used for + //! deserialization into this class during communication + void resize(size_t new_size) { + if (!this->raw_memory) { + this->dummy.resize(new_size); + this->raw_memory = this->dummy.data(); + this->num_elements = this->dummy.size(); + } else { + GALOIS_DIE("calling resize when there is already raw memory " + "allocated"); + } } }; -} // end namespace +} // namespace galois #endif diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 688e4be59d..40cd4f4b7e 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -43,6 +43,7 @@ #include #include #include "galois/CopyableTuple.h" +#include "galois/BufferWrapper.h" #include "galois/Bag.h" namespace galois { @@ -305,6 +306,12 @@ gSizedObj(const T&, return sizeof(uintptr_t); } +//! Size of BufferWrapper is size + number of things in it +template +inline size_t gSizedObj(const galois::BufferWrapper& data) { + return sizeof(size_t) + data.size() * sizeof(T); +} + /** * Returns the size necessary for storing 2 elements of a pair into a * serialize buffer. @@ -561,6 +568,11 @@ template inline void gSerializeObj(SerializeBuffer& buf, const std::vector& data); +// Forward declaration of buff serialize +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data); + /** * Serialize a sequence type into a buffer. 
* @@ -608,6 +620,18 @@ inline void gSerializeObj(SerializeBuffer& buf, gSerializeSeq(buf, data); } +//! Serialize BufferWrapper similarly to vector +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data) { + if (is_memory_copyable::value) { + gSerializeLinearSeq(buf, data); + } else { + GALOIS_DIE("have not implemented support for serializing nonPOD buffer " + "wrapper"); + } +} + /** * Serialize a PODResizeableArray into a buffer, choosing to do a memcopy or * to serialize each element individually depending on data. @@ -919,6 +943,10 @@ gDeserializeObj(DeSerializeBuffer& buf, template void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data); +// Forward declaration of buff wrapper deserialize +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& data); + /** * Deserialize into a sequence object * @@ -986,6 +1014,20 @@ void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data) { gDeserializeSeq(buf, data); } +//! deserialize into buf wrapper +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { + if (is_memory_copyable::value) { + // manual deserialization here + size_t buffer_size; + gDeserializeObj(buf, buffer_size); + bf.resize(buffer_size); + buf.extract((uint8_t*)bf.data(), buffer_size * sizeof(T)); + } else { + GALOIS_DIE("deserialize for buf wrapper not implemented for nonpod"); + } +} + /** * Deserialize into a PODResizeableArray * From 46a052effcea16c251db6427c554a16c7773d117 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 18:03:00 -0500 Subject: [PATCH 326/660] fixed bufferwrapper vector access Originally the raw_mem pointer is initialized to the same data as the dummy vector. However, the dummy vector's data pointer is updated at some point, causing the old initialization to be useless. Changed behavior such that if the dummy vector is initialized, then access is always from it. TODO how much overhead is the check? --- libdist/include/galois/BufferWrapper.h | 50 +++++++++++++++++----- libdist/include/galois/runtime/Serialize.h | 2 +- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h index 4c5854d5ad..8066f3a25e 100644 --- a/libdist/include/galois/BufferWrapper.h +++ b/libdist/include/galois/BufferWrapper.h @@ -21,8 +21,9 @@ class BufferWrapper { //! This vector is allocated when creating a buffer wrapper from scratch //! (i.e. during deserialization into one) galois::gstl::Vector dummy; - //! Raw memory kept by this class; either points to existing memory or to the - //! vector memory held by this class + //! Raw memory kept by this class; either points to existing memory or is + //! empty (vector.data changes when this object is copied, causes issues + //! with correctness) ElementType* raw_memory; //! Number of elements that can be accessed from the raw_memory pointer size_type num_elements; @@ -30,6 +31,7 @@ class BufferWrapper { public: //! Default constructor 0s everything BufferWrapper() { + dummy.clear(); this->raw_memory = 0; this->num_elements = 0; } @@ -50,37 +52,63 @@ class BufferWrapper { //! Returns element at some specified index of the array ElementType& operator[](size_t index) { - assert(index < num_elements); - return raw_memory[index]; + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } } //! Returns element at some specified index of the array; const i.e. not //!
modifiable const ElementType& operator[](size_t index) const { - assert(index < num_elements); - return raw_memory[index]; + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } } //! Return number of elements in the array size_t size() const { return this->num_elements; } //! return unmodifiable pointer to raw_memory - const ElementType* data() const { return raw_memory; } + const ElementType* data() const { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } + //! return pointer to raw_memory - ElementType* data() { return raw_memory; } + ElementType* data() { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } //! Allocates memory in the underlying vector; should only be used for //! deserialization into this class during communication + //! This also means you shouldn't use raw_data void resize(size_t new_size) { - if (!this->raw_memory) { + if (!this->dummy.size()) { this->dummy.resize(new_size); - this->raw_memory = this->dummy.data(); this->num_elements = this->dummy.size(); } else { - GALOIS_DIE("calling resize when there is already raw memory " + GALOIS_DIE("calling resize when there is already memory " "allocated"); } } + + ElementType* get_vec_data() { + assert(this->dummy.size()); + return dummy.data(); + } }; } // namespace galois diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 40cd4f4b7e..489676928b 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -1022,7 +1022,7 @@ void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { size_t buffer_size; gDeserializeObj(buf, buffer_size); bf.resize(buffer_size); - buf.extract((uint8_t*)bf.data(), buffer_size * sizeof(T)); + buf.extract((uint8_t*)bf.get_vec_data(), buffer_size * sizeof(T)); } else { GALOIS_DIE("deserialize for buf wrapper not implemented for nonpod"); } From b1fba8e18afc05311a8dba8b5ab388ec776364a7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 19:21:50 -0500 Subject: [PATCH 327/660] thread local rng for dropout_cpu --- libdeepgalois/include/deepgalois/utils.h | 16 ++++ libdeepgalois/src/math_functions.cpp | 112 +++-------------------- 2 files changed, 30 insertions(+), 98 deletions(-) diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 91ccc94b83..ca93b9da62 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -65,6 +65,22 @@ class Timer { struct timeval elapsed_time_; }; +class PerThreadRNG { + galois::substrate::PerThreadStorage engine; + galois::substrate::PerThreadStorage> + distribution; + +public: + //! init distribution + PerThreadRNG() : distribution{0.0, 1.0} {}; + + //! 
thread local RNG float from 0 to 1 + float_t get_number() { + float_t num = (*distribution.getLocal())(*engine.getLocal()); + return num; + } +}; + class random_generator { public: static random_generator& get_instance() { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 3b96341c66..11f99e15e0 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -38,15 +38,20 @@ void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { } */ -std::default_random_engine generator; -std::uniform_real_distribution distribution(0.0, 1.0); +// anon namespace so these things don't leak elsewhere +namespace { +static deepgalois::PerThreadRNG* per_thread_rng = nullptr; +} namespace deepgalois { namespace math { inline uint8_t bernoulli(float_t p) { - return distribution(generator) > p ? 1 : 0; + if (!per_thread_rng) { + per_thread_rng = new PerThreadRNG(); + } + return per_thread_rng->get_number() > p ? 1 : 0; } //! wrapper function to call cblas_sgemm @@ -116,80 +121,7 @@ void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -inline void rng_uniform_cpu(size_t n, float_t* r) { -#ifdef USE_MKL - VSLStreamStatePtr stream; - // Initializing the streams - vslNewStream(&stream, VSL_BRNG_SOBOL, 1); - // Generating - vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, 0.0f, 1.0f); - // Deleting the streams - vslDeleteStream(&stream); -#else - for (size_t i = 0; i < n; ++i) { - r[i] = distribution(generator); - } - // galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - // unsigned short xi[3]; - // r[i] = erand48(xi); - //}, galois::loopname("randomMaskGen")); -#endif -} - const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 -/* -// vector add -void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { -#ifdef __AVX2__ - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), -_mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + -b[i]; #else for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; #endif -} - -#if defined(__AVX__) || defined(__AVX2__) -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) -{ const size_t alignedN = n - n % vec_len; const __m256 scal = -_mm256_set1_ps(alpha); for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); - for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; -} - -// SAXPY stands for โ€œSingle-precision A*X Plus Y" -void axpy(size_t n, const float_t a, float_t *x, float_t *y) { - const size_t alignedN = n - n % vec_len; - const __m256 alpha = _mm256_set1_ps(a); - for (size_t i = 0; i < alignedN; i += vec_len) { - __m256 product = _mm256_mul_ps(_mm256_loadu_ps(&x[i]), alpha); - _mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&y[i]), product)); - } - for (size_t i = alignedN; i < n; ++i) y[i] = a * x[i] + y[i]; -} - -float_t l2_norm(size_t n, const float_t* in) { - const size_t alignedN = n - n % vec_len; - __m256 vsum = _mm256_set1_ps(0.0); - for (size_t i = 0; i < alignedN; i += vec_len) { - __m256 a = _mm256_loadu_ps(&in[i]); - vsum = _mm256_add_ps(vsum, _mm256_mul_ps(a, a)); - } - __m256 sum = _mm256_hadd_ps(vsum, vsum); - return (((float_t*)&sum)[0] + ((float_t*)&sum)[2]) / 2.0; -} -#else -// vector multiply scalar 
-void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) -{ for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; -} - -float_t l2_norm(size_t n, const float_t* a) { - float_t sum = 0.0; - for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; - return sum / 2.0; -} -#endif -*/ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { #ifdef USE_MKL @@ -259,28 +191,12 @@ void dropout(size_t m, float scale, float dropout_rate, const float_t* in, void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { size_t len = n * m; - /* - #ifdef USE_MKL - vec_t rands(len); - rng_uniform_cpu(len, &rands[0]); - galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { - masks[i] = rands[i] > dropout_rate ? 1 : 0; - }, galois::loopname("randomMaskGen")); - */ - /* - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - auto idx = i * m; - vec_t rands(m); - rng_uniform_cpu(m, &rands[0]); - for (size_t j = 0; j < m; ++j) - masks[idx+j] = rands[j] > dropout_rate ? 1 : 0; - }, galois::loopname("dropout")); - #else - */ - for (size_t i = 0; i < len; ++i) { - masks[i] = bernoulli(dropout_rate); - } - //#endif + + galois::do_all( + galois::iterate((size_t)0, len), + [&](size_t i) { masks[i] = bernoulli(dropout_rate); }, + galois::loopname("dropout RNG")); + galois::do_all( galois::iterate((size_t)0, len), [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, From b2f1c6f82d9e114aa462ce58186d05ebe6d6487e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 15 Jul 2020 09:58:00 -0500 Subject: [PATCH 328/660] fix gpu compilation --- libdeepgalois/include/deepgalois/random.h | 53 +++++++++++++++++++++++ libdeepgalois/include/deepgalois/utils.h | 47 +------------------- libdeepgalois/src/math_functions.cpp | 1 + libdeepgalois/src/reader.cpp | 4 ++ libdeepgalois/src/utils.cpp | 3 ++ 5 files changed, 62 insertions(+), 46 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/random.h diff --git a/libdeepgalois/include/deepgalois/random.h b/libdeepgalois/include/deepgalois/random.h new file mode 100644 index 0000000000..bf1648bc2a --- /dev/null +++ b/libdeepgalois/include/deepgalois/random.h @@ -0,0 +1,53 @@ +// random number generators for CPU +#pragma once + +#include +#include "galois/Galois.h" +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { + +class PerThreadRNG { + galois::substrate::PerThreadStorage engine; + galois::substrate::PerThreadStorage> + distribution; + +public: + //! init distribution + PerThreadRNG() : distribution{0.0, 1.0} {}; + + //! 
thread local RNG float from 0 to 1 + float_t get_number() { + float_t num = (*distribution.getLocal())(*engine.getLocal()); + return num; + } +}; + +class random_generator { +public: + static random_generator& get_instance() { + static random_generator instance; + return instance; + } + std::mt19937& operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } + +private: + random_generator() : gen_(1) {} + std::mt19937 gen_; +}; + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} +} //end of namespace diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index ca93b9da62..bf74aad196 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -7,8 +7,7 @@ #include #include #include -#include "deepgalois/GraphTypes.h" -//#include "galois/DistGalois.h" +#include "deepgalois/types.h" namespace deepgalois { @@ -65,50 +64,6 @@ class Timer { struct timeval elapsed_time_; }; -class PerThreadRNG { - galois::substrate::PerThreadStorage engine; - galois::substrate::PerThreadStorage> - distribution; - -public: - //! init distribution - PerThreadRNG() : distribution{0.0, 1.0} {}; - - //! thread local RNG float from 0 to 1 - float_t get_number() { - float_t num = (*distribution.getLocal())(*engine.getLocal()); - return num; - } -}; - -class random_generator { -public: - static random_generator& get_instance() { - static random_generator instance; - return instance; - } - std::mt19937& operator()() { return gen_; } - void set_seed(unsigned int seed) { gen_.seed(seed); } - -private: - random_generator() : gen_(1) {} - std::mt19937 gen_; -}; - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_int_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_real_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - // sequential prefix sum template inline std::vector prefix_sum(const std::vector& in) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 11f99e15e0..6d5b13df78 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -6,6 +6,7 @@ #include "galois/Timer.h" #include "galois/Galois.h" #include "deepgalois/utils.h" +#include "deepgalois/random.h" #include "deepgalois/math_functions.hh" #ifdef USE_MKL diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index bf5792fca4..d7e1bcf44b 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -1,6 +1,7 @@ #include "deepgalois/reader.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" +#include "galois/Galois.h" #include #include #include @@ -8,6 +9,9 @@ #include /* For open(), creat() */ #include #include +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif namespace deepgalois { diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 929881dd25..61ff3a2e58 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -1,5 
+1,8 @@ #include "galois/Galois.h" #include "deepgalois/utils.h" +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif namespace deepgalois { From 33d2b542d9e2aea7091b503537160bab4e783652 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 15 Jul 2020 11:24:57 -0500 Subject: [PATCH 329/660] add gat_fw --- .../include/deepgalois/layers/layer.h | 1 + .../include/deepgalois/math_functions.hh | 3 + libdeepgalois/src/layers/gat_fw.h | 66 +++++++++++++++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 6 ++ libdeepgalois/src/math_functions.cpp | 6 ++ 5 files changed, 82 insertions(+) create mode 100644 libdeepgalois/src/layers/gat_fw.h diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 874e7d41c6..7ac5b8b649 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -60,6 +60,7 @@ class layer : public deepgalois::node { vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x // 16, layer1: 16 x E + vec_t alpha; // parameters to learn (H x 1), only used for GAT vec_t weight_grad; // weight gradient for updating parameters float_t* d_W; float_t* d_weight_grad; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 6c002e2ffb..8f5cc25d37 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -48,6 +48,9 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); //! do dot product of 2 vectors float_t dot(size_t n, const float_t* x, const float_t* y); +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z); + // SAXPY stands for โ€œSingle-precision A*X Plus Y" void axpy(size_t n, const float_t a, float_t* x, float_t* y); diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h new file mode 100644 index 0000000000..3e77ebc797 --- /dev/null +++ b/libdeepgalois/src/layers/gat_fw.h @@ -0,0 +1,66 @@ +//#define USE_GAT +#ifdef USE_GAT +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + // (1) dropout + if (dropout_ && phase_ == net_phase::train) { + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, + dropout_mask, in_temp); + } else { + math::copy_cpu(x * y, in_data, in_temp); + } + + // (2) linear transformation + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); + + auto &g = *graph_cpu; + size_t n = g.size(); + size_t len = z; + float_t* in = out_temp; + float_t* out = out_data; + + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto deg_src = g.get_degree(src); + + // (3) concatenation, dot product, LeakyReLU + int i = 0; + vec_t scores(deg_src); + //for (auto e : g.edges(src)) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + vec_t concat_vec(2*z); + math::concat(z, &in[src_idx], &in[dst_idx], &concat_vec[0]); + // alpha: learnable weight vector + scores[i++] = math::dot(2*z, &alpha[0], &concat_vec[0]); + } + + // (4) softmax to normalize the 
attention scores on each vertexโ€™s incoming edges + vec_t normalized_scores(deg_src, 0); + math::softmax(deg_src, &scores[0], &normalized_scores[0]); // normalize using softmax + math::clear_cpu(len, &out[src_idx]); + + // (5) aggregation: scaled by the attention scores + //for (auto e : g.edges(src)) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto score = normalized_scores[dst]; + vec_t neighbor(len); + math::scale(len, score, &in[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } + }); + + // (6) ReLU + if (act_) math::relu_cpu(x * z, out_data, out_data); +} +#endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 4c11086495..a6c49f615b 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -3,6 +3,7 @@ #include "deepgalois/utils.h" namespace deepgalois { +#include "gat_fw.h" //! Set this to let sync struct know where to get data from float_t* _dataToSync = nullptr; @@ -86,6 +87,9 @@ void graph_conv_layer::malloc_and_init() { // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); + // alpha is only used for GAT + rand_init_matrix(2*z, 1, alpha, 1); + if (dropout_) dropout_mask = new mask_t[x * y]; in_temp = new float_t[x * y]; @@ -95,6 +99,7 @@ void graph_conv_layer::malloc_and_init() { in_temp1 = new float_t[x * y]; } +#ifndef USE_GAT // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -146,6 +151,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, conv_timer.stop(); } +#endif // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 6d5b13df78..a5e6b50eec 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -175,6 +175,12 @@ float_t dot(size_t n, const float_t* x, const float_t* y) { return cblas_sdot(n, x, 1, y, 1); } +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z) { + copy_cpu(n, x, z); + copy_cpu(n, y, z+n); +} + void clear_cpu(size_t n, float_t* in) { // for (size_t i = 0; i < n; i++) in[i] = 0; std::fill(in, in + n, 0); From 234f31c56eb0492cb5b7e0db8efd5819a39285ec Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 12:52:29 -0500 Subject: [PATCH 330/660] bitset for forward sync for graphconv --- .../layers/GraphConvSyncStructures.h | 17 ++++++++- .../deepgalois/layers/graph_conv_layer.h | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 37 +++++++++++++++++-- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index b07b672fa1..bc88656bec 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -8,8 +8,9 @@ struct GraphConvSync { //! 
return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { - ValTy vecToReturn(&deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], - deepgalois::_syncVectorSize); + ValTy vecToReturn( + &deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], + deepgalois::_syncVectorSize); // move constructor should kick in here to avoid return copy return vecToReturn; } @@ -54,5 +55,17 @@ struct GraphConvSync { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; +struct Bitset_gradient { + static constexpr bool is_vector_bitset() { return false; } + + static constexpr bool is_valid() { return true; } + + static galois::DynamicBitSet& get() { return bitset_gradient; } + + static void reset_range(size_t begin, size_t end) { + bitset_gradient.reset(begin, end); + } +}; + #endif #endif diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index ad954215fc..f1501d39d2 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,7 +1,6 @@ #pragma once #include "layer.h" #include "deepgalois/layers/aggregator.h" -#include "deepgalois/layers/GraphConvSyncStructures.h" /** * GraphConv Layer; based on DGL implementation + follows TinyDNN layer diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index a6c49f615b..791b9c9a51 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -2,6 +2,9 @@ #include "deepgalois/math_functions.hh" #include "deepgalois/utils.h" +static galois::DynamicBitSet bitset_gradient; +#include "deepgalois/layers/GraphConvSyncStructures.h" + namespace deepgalois { #include "gat_fw.h" @@ -73,6 +76,9 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; + galois::gInfo("bitset size is going to be ", x); + bitset_gradient.resize(x); + // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); @@ -88,7 +94,7 @@ void graph_conv_layer::malloc_and_init() { zero_init_matrix(y, z, layer::weight_grad); // alpha is only used for GAT - rand_init_matrix(2*z, 1, alpha, 1); + rand_init_matrix(2 * z, 1, alpha, 1); if (dropout_) dropout_mask = new mask_t[x * y]; @@ -139,8 +145,33 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO how to do this for the sampled case? 
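  // Context for the sync below: _syncVectorSize tells the sync structure how
  // many floats each vertex contributes (one z-length row of out_data), and
  // _dataToSync points it at the matrix being synchronized; the bitset filled
  // in by this patch restricts that communication to rows that contain at
  // least one non-zero entry.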
deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync( - "GraphConvForward"); + // bitset setting + galois::do_all( + galois::iterate((size_t)0, bitset_gradient.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_gradient.set(node_id); + } + }, + galois::loopname("BitsetGraphConvForward"), galois::no_stats()); + galois::gPrint(bitset_gradient.count(), " out of ", bitset_gradient.size(), + "\n"); + layer::context->getSyncSubstrate() + ->sync( + "GraphConvForward"); // run relu activation on output if specified galois::StatTimer relu_timer("GraphConvForwardRelu"); From 1451f8550f46df2436d731f1adc7b83b2d67c645 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 13:09:43 -0500 Subject: [PATCH 331/660] backward graphconv sync bitset --- libdeepgalois/src/layers/graph_conv_layer.cpp | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 791b9c9a51..3ba451ebf3 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -230,8 +230,33 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync( - "GraphConvBackward"); + galois::do_all( + galois::iterate((size_t)0, bitset_gradient.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_gradient.set(node_id); + } + }, + galois::loopname("BitsetGraphConvBackward"), galois::no_stats()); + galois::gPrint("backward ", bitset_gradient.count(), " out of ", + bitset_gradient.size(), "\n"); + + layer::context->getSyncSubstrate() + ->sync( + "GraphConvBackward"); galois::StatTimer drop_timer("GraphConvBackwardDropout"); drop_timer.start(); From 374b2725a8a9c499e350fea2dae0fe202b7bc73d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 15:58:26 -0500 Subject: [PATCH 332/660] conv bitset namechange, cleanup/modularity --- .../deepgalois/layers/GradientSyncStructs.h | 3 +- .../layers/GraphConvSyncStructures.h | 13 +-- .../include/deepgalois/layers/layer.h | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 95 +++++++++---------- 4 files changed, 45 insertions(+), 67 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 9b325311b7..26420aa30d 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -42,7 +42,6 @@ struct GradientSync { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; -// TODO bitset; might have to do it manually -// GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); +// no 
bitset; everything is sent anyways #endif #endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index bc88656bec..570aa56d2b 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -55,17 +55,6 @@ struct GraphConvSync { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; -struct Bitset_gradient { - static constexpr bool is_vector_bitset() { return false; } - - static constexpr bool is_valid() { return true; } - - static galois::DynamicBitSet& get() { return bitset_gradient; } - - static void reset_range(size_t begin, size_t end) { - bitset_gradient.reset(begin, end); - } -}; - +GALOIS_SYNC_STRUCTURE_BITSET(conv); #endif #endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7ac5b8b649..5d4aae6023 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -18,7 +18,6 @@ #ifndef GALOIS_ENABLE_GPU #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" -#include "deepgalois/layers/GradientSyncStructs.h" #endif namespace deepgalois { diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3ba451ebf3..1d543f0a78 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -2,8 +2,10 @@ #include "deepgalois/math_functions.hh" #include "deepgalois/utils.h" -static galois::DynamicBitSet bitset_gradient; +static galois::DynamicBitSet bitset_conv; + #include "deepgalois/layers/GraphConvSyncStructures.h" +#include "deepgalois/layers/GradientSyncStructs.h" namespace deepgalois { #include "gat_fw.h" @@ -76,8 +78,8 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; - galois::gInfo("bitset size is going to be ", x); - bitset_gradient.resize(x); + galois::gInfo("conv bitset size is going to be ", x); + bitset_conv.resize(x); // setup gluon layer::gradientGraph = @@ -86,6 +88,7 @@ void graph_conv_layer::malloc_and_init() { new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), layer::gradientGraph->numHosts(), false); + galois::gInfo("gradient bitset size is going to be ", y * z); // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); @@ -105,6 +108,34 @@ void graph_conv_layer::malloc_and_init() { in_temp1 = new float_t[x * y]; } +namespace { +void set_conv_bitset() { + // bitset setting + galois::do_all( + galois::iterate((size_t)0, bitset_conv.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_conv.set(node_id); + } + }, + galois::loopname("BitsetGraphConv"), galois::no_stats()); +} + +} // end anonymous namespace + #ifndef USE_GAT // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, @@ -145,33 +176,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO how to do this for the 
sampled case? deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; - // bitset setting - galois::do_all( - galois::iterate((size_t)0, bitset_gradient.size()), - [&](size_t node_id) { - bool set_true = false; - // check for non-zeros; the moment one is found, set true becomes true - // and we break out of the loop - for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { - auto val = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + - i]; - if (val != 0) { - set_true = true; - break; - } - } + set_conv_bitset(); + galois::gPrint("forward ", bitset_conv.count(), " out of ", + bitset_conv.size(), "\n"); - if (set_true) { - bitset_gradient.set(node_id); - } - }, - galois::loopname("BitsetGraphConvForward"), galois::no_stats()); - galois::gPrint(bitset_gradient.count(), " out of ", bitset_gradient.size(), - "\n"); layer::context->getSyncSubstrate() - ->sync( - "GraphConvForward"); + ->sync("GraphConvForward"); // run relu activation on output if specified galois::StatTimer relu_timer("GraphConvForwardRelu"); @@ -230,32 +240,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; - galois::do_all( - galois::iterate((size_t)0, bitset_gradient.size()), - [&](size_t node_id) { - bool set_true = false; - // check for non-zeros; the moment one is found, set true becomes true - // and we break out of the loop - for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { - auto val = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + - i]; - if (val != 0) { - set_true = true; - break; - } - } - - if (set_true) { - bitset_gradient.set(node_id); - } - }, - galois::loopname("BitsetGraphConvBackward"), galois::no_stats()); - galois::gPrint("backward ", bitset_gradient.count(), " out of ", - bitset_gradient.size(), "\n"); + set_conv_bitset(); + galois::gPrint("backward ", bitset_conv.count(), " out of ", + bitset_conv.size(), "\n"); layer::context->getSyncSubstrate() - ->sync( + ->sync( + //->sync( "GraphConvBackward"); galois::StatTimer drop_timer("GraphConvBackwardDropout"); From ac136dbf443c7e6091edef4db54f27f8cdb5a8db Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 17:12:16 -0500 Subject: [PATCH 333/660] gnn-cvc --- .../galois/graphs/GenericPartitioners.h | 88 +++++++++++++++++++ lonestar/gnn/include/DistributedGraphLoader.h | 8 +- lonestar/gnn/src/DistributedGraphLoader.cpp | 15 ++-- 3 files changed, 103 insertions(+), 8 deletions(-) diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index f1a0809f37..3f0d30e212 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -956,4 +956,92 @@ class GnnOEC : public galois::graphs::CustomMasterAssignment { } }; +class GnnCVC : public galois::graphs::CustomMasterAssignment { + unsigned numRowHosts; + unsigned numColumnHosts; + unsigned _h_offset; + + void factorizeHosts() { + numColumnHosts = sqrt(_numHosts); + + while ((_numHosts % numColumnHosts) != 0) + numColumnHosts--; + + numRowHosts = _numHosts / numColumnHosts; + assert(numRowHosts >= numColumnHosts); + + if (_hostID == 0) { + galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, + "\n"); + } + } + + //! Returns the grid row ID of this host + unsigned gridRowID() const { return (_hostID / numColumnHosts); } + //! 
Returns the grid row ID of the specified host + unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } + //! Returns the grid column ID of this host + unsigned gridColumnID() const { return (_hostID % numColumnHosts); } + //! Returns the grid column ID of the specified host + unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } + + //! Find the column of a particular node + unsigned getColumnOfNode(uint64_t gid) const { + return gridColumnID(retrieveMaster(gid)); + } + +public: + GnnCVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges) { + factorizeHosts(); + _h_offset = gridRowID() * numColumnHosts; + }; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const { + int i = getColumnOfNode(dst); + return _h_offset + i; + } + + bool noCommunication() { return false; } + bool isVertexCut() const { + if ((numRowHosts == 1) || (numColumnHosts == 1)) + return false; + return true; + } + + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(numRowHosts, numColumnHosts); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + #endif diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index f3755a886f..65104a6031 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -53,7 +53,8 @@ enum PARTITIONING_SCHEME { FENNEL_O, //!< Fennel, oec FENNEL_I, //!< Fennel, iec SUGAR_O, //!< Sugar, oec - GNN_OEC //!< gnn, oec + GNN_OEC, //!< gnn, oec + GNN_CVC //!< gnn, cvc }; /** @@ -88,6 +89,8 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { return "sugar-oec"; case GNN_OEC: return "gnn-oec"; + case GNN_CVC: + return "gnn-cvc"; default: GALOIS_DIE("Unsupported partition"); } @@ -145,6 +148,9 @@ DistGraph* constructSymmetricGraph(std::vector&) { case GNN_OEC: return cuspPartitionGraph( inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + case GNN_CVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index 71953ea53e..e18340fe82 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -32,11 +32,12 @@ namespace cll = llvm::cl; cll::opt partitionScheme( "partition", cll::desc("Type of partitioning."), - cll::values(clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), - clEnumValN(IEC, "iec", "Incoming Edge-Cut"), - clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), - 
clEnumValN(CART_VCUT_IEC, "cvc-iec", - "Cartesian Vertex-Cut of iec"), - clEnumValN(GNN_OEC, "g-oec", - "gnn oec: train nodes evenly distributed")), + cll::values( + clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), + clEnumValN(IEC, "iec", "Incoming Edge-Cut"), + clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), + clEnumValN(CART_VCUT_IEC, "cvc-iec", "Cartesian Vertex-Cut of iec"), + clEnumValN(GNN_OEC, "g-oec", "gnn oec: train nodes evenly distributed"), + clEnumValN(GNN_CVC, "g-cvc", + "gnn cvc: train nodes evenly distributed")), cll::init(GNN_OEC)); From 33661539fc9276f1f0e6f21a57006ed7156bad8e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Jul 2020 10:37:46 -0500 Subject: [PATCH 334/660] update gat --- .../deepgalois/layers/graph_conv_layer.h | 1 + .../include/deepgalois/layers/layer.h | 21 ++- .../include/deepgalois/math_functions.hh | 2 + libdeepgalois/src/layers/gat_fw.h | 171 +++++++++++++----- libdeepgalois/src/layers/graph_conv_layer.cpp | 92 ++++++---- libdeepgalois/src/math_functions.cpp | 8 + 6 files changed, 208 insertions(+), 87 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index f1501d39d2..d112ddf785 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -74,6 +74,7 @@ class graph_conv_layer : public layer { float_t* in_temp1; float_t* trans_data; // y*x mask_t* dropout_mask; // x*y + float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 5d4aae6023..534d99b821 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -57,18 +57,25 @@ class layer : public deepgalois::node { bool trainable_; // is this layer trainable bool use_mask; vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x - // 16, layer1: 16 x E - vec_t alpha; // parameters to learn (H x 1), only used for GAT + vec_t Q; // parameters to learn, for vertex v's neighbors, same size as W vec_t weight_grad; // weight gradient for updating parameters - float_t* d_W; - float_t* d_weight_grad; + float_t* d_W; // parameters to learn on device (GPU) + float_t* d_weight_grad; // weight gradient on device (GPU) + vec_t alpha_l; // parameters to learn (H x 1), only used for GAT + vec_t alpha_r; // parameters to learn (H x 1), only used for GAT + vec_t alpha_lgrad; // gradients for updating alpha (GAT only) + vec_t alpha_rgrad; // gradients for updating alpha (GAT only) mask_t* masks_; // masks to show which samples are valid - mask_t* d_masks_; + mask_t* d_masks_; // masks on device (GPU) float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; - float_t* norm_consts; + float_t* norm_consts; // normalization score + vec_t scores; // un-normalized scores + vec_t temp_scores; // un-normalized scores + vec_t scores_grad; // gradients of un-normalized scores + vec_t norm_scores; // normalized scores + vec_t norm_scores_grad; // gradients of normalized scores // TODO #ifdef GALOIS_ENABLE_GPU GraphGPU* graph_gpu; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 8f5cc25d37..38f461620a 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -81,6 +81,8 @@ void relu_cpu(size_t n, const float_t* in, float_t* out); void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); // Leaky ReLU +void leaky_relu(float_t epsilon, float_t in, float_t &out); +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out); void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h index 3e77ebc797..e9a7bada37 100644 --- a/libdeepgalois/src/layers/gat_fw.h +++ b/libdeepgalois/src/layers/gat_fw.h @@ -1,5 +1,108 @@ //#define USE_GAT #ifdef USE_GAT +// `Graph Attention Network ` +// NOTE: GAT paper uses "first concatenation then linear projection" +// to compute attention scores, while ours is "first projection then +// addition", the two approaches are mathematically equivalent: +// We decompose the weight vector a mentioned in the paper into +// [a_l || a_r], then a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j +// Our implementation is much efficient because we do not need to +// save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, +// addition could be optimized with DGL's built-in function u_add_v, +// which further speeds up computation and saves memory footprint. 
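+// As an illustrative sketch (the numbers below are made up, not taken from
+// the code): with a_l = [1, 2], a_r = [3, 4], Wh_i = [5, 6], Wh_j = [7, 8],
+//   a^T [Wh_i || Wh_j] = 1*5 + 2*6 + 3*7 + 4*8 = 70, and
+//   a_l . Wh_i + a_r . Wh_j = (5 + 12) + (21 + 32) = 70,
+// so the two formulations agree without ever materializing [Wh_i || Wh_j].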
+ +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + size_t n = g.size(); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto deg_src = g.get_degree(src); + + // concatenation, dot product, LeakyReLU + //int i = 0; + //vec_t scores(deg_src); + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + // alpha: learnable weight vector (shared by all vertices) + float_t src_score = math::dot(len, &alpha_l[0], &in[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + //vec_t concat_vec(2*len); + //math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); + //float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); + float_t dst_score = math::dot(len, &alpha_r[0], &in[dst_idx]); + temp_scores[e] = src_score + dst_score; + math::leaky_relu(epsilon, temp_scores[e], scores[e]); + } + + // softmax to normalize the attention scores on each vertexโ€™s incoming edges + //vec_t normalized_scores(deg_src, 0); + //math::softmax(deg_src, &scores[0], &normalized_scores[0]); + math::softmax(deg_src, &scores[begin], &norm_scores[begin]); + + // aggregation: scaled by the attention scores + math::clear_cpu(len, &out[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto score = norm_scores[e]; + vec_t neighbor(len); + math::scale(len, score, &in[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } + }); +} + +void graph_conv_layer::d_compute_scores(size_t len, Graph& g, + const float_t* in_data, + const float_t *out_data, + const float_t* in_grad) { + size_t n = g.size(); + + // compute gradients for the learnable vector `alpha` + //vec_t temp_grad(n*n); + //math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, + // in_grad, 0.0, temp_grad); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + auto deg_src = g.get_degree(src); + math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], + &scores_grad[begin], &norm_scores_grad[begin]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + // use norm_scores_grad as temp_scores_grad since its data is useless already + math::d_leaky_relu(epsilon, &scores_grad[e], + &temp_scores[e], &norm_scores_grad[e]); + math::scale(len, norm_scores_grad[e], &in_data[src_idx], &alpha_lgrad[0]); + math::scale(len, norm_scores_grad[e], &in_data[dst_idx], &alpha_rgrad[0]); + } + }); +} + +void graph_conv_layer::d_aggregate(size_t len, Graph& g, + const float_t* in_grad, float_t* out_grad) { + size_t n = g.size(); + + // aggregation: the derivative is transposed; + // the graph is undirected (structurally symmetric), + // but values are not the same for the symmetric positions + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto src_begin = g.edge_begin(src); + for (auto e = src_begin; e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto dst_begin = g.edge_begin(dst); + auto score = norm_scores[dst_begin+e-src_begin]; // transposed + vec_t neighbor(len); + math::scale(len, score, &in_grad[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out_grad[src_idx], &neighbor[0], &out_grad[src_idx]); + } + }); +} + void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { 
galois::StatTimer conv_timer("GraphConvForward"); @@ -8,7 +111,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; - // (1) dropout + // dropout if (dropout_ && phase_ == net_phase::train) { math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); @@ -16,51 +119,37 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, math::copy_cpu(x * y, in_data, in_temp); } - // (2) linear transformation + // linear transformation math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); - auto &g = *graph_cpu; - size_t n = g.size(); - size_t len = z; - float_t* in = out_temp; - float_t* out = out_data; + // aggregation + aggregate(z, *graph_cpu, out_temp, out_data); - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - auto src_idx = src * len; - auto deg_src = g.get_degree(src); - - // (3) concatenation, dot product, LeakyReLU - int i = 0; - vec_t scores(deg_src); - //for (auto e : g.edges(src)) { - for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { - auto dst = g.getEdgeDst(e); - auto dst_idx = dst * len; - vec_t concat_vec(2*z); - math::concat(z, &in[src_idx], &in[dst_idx], &concat_vec[0]); - // alpha: learnable weight vector - scores[i++] = math::dot(2*z, &alpha[0], &concat_vec[0]); - } + // ReLU + if (act_) math::relu_cpu(x * z, out_data, out_data); +} - // (4) softmax to normalize the attention scores on each vertexโ€™s incoming edges - vec_t normalized_scores(deg_src, 0); - math::softmax(deg_src, &scores[0], &normalized_scores[0]); // normalize using softmax - math::clear_cpu(len, &out[src_idx]); +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (act_) math::d_relu_cpu(x * z, out_grad, out_data, out_grad); - // (5) aggregation: scaled by the attention scores - //for (auto e : g.edges(src)) { - for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { - auto dst = g.getEdgeDst(e); - auto dst_idx = dst * len; - auto score = normalized_scores[dst]; - vec_t neighbor(len); - math::scale(len, score, &in[dst_idx], &neighbor[0]); - math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); - } - }); - - // (6) ReLU - if (act_) math::relu_cpu(x * z, out_data, out_data); + // compute gradients for alpha (alpha is a learnable vector) + d_compute_scores(z, *graph_cpu, in_temp, out_temp, out_grad); + // compute gradients for feature vectors + d_aggregate(z, *graph_cpu, out_grad, out_temp); + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); } + #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 1d543f0a78..a17f6527bc 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,42 +37,6 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, } } -// aggregate based on graph topology -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, - float_t* out) { - galois::StatTimer 
aggregate_timer("AggregateTime"); - aggregate_timer.start(); - // normalization constant based on graph structure -#ifdef USE_MKL - update_all_csrmm(len, g, in, out, norm_, norm_consts); -#else - update_all(len, g, in, out, norm_, norm_consts); -#endif - aggregate_timer.stop(); -} - -// since graph is symmetric, the derivative is the same -void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, - float_t* out) { - galois::StatTimer aggregate_timer("AggregateDerivativeTime"); - aggregate_timer.start(); -#ifdef USE_MKL - update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z -#else - update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z -#endif - aggregate_timer.stop(); -} - -void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, - const float_t* neighbors, float_t* out) { - float_t* a = new float_t[len]; - float_t* b = new float_t[len]; - math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); - math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); - math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors -} - void graph_conv_layer::malloc_and_init() { size_t x = input_dims[0]; size_t y = input_dims[1]; @@ -92,12 +56,26 @@ void graph_conv_layer::malloc_and_init() { // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); + //rand_init_matrix(y, z, Q, 1); // for GraphSAGE - // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); +#ifdef USE_GAT // alpha is only used for GAT - rand_init_matrix(2 * z, 1, alpha, 1); + rand_init_matrix(z, 1, alpha_l, 1); + rand_init_matrix(z, 1, alpha_r, 1); + alpha_lgrad.resize(2*z); + alpha_rgrad.resize(2*z); + std::fill(alpha_lgrad.begin(), alpha_lgrad.end(), 0); + std::fill(alpha_rgrad.begin(), alpha_rgrad.end(), 0); + auto ne = graph_cpu->sizeEdges(); // number of edges + scores.resize(ne); // a score for each edge + temp_scores.resize(ne); + scores_grad.resize(ne); + norm_scores.resize(ne); + norm_scores_grad.resize(ne); + epsilon = 0.2; // LeakyReLU angle of negative slope +#endif if (dropout_) dropout_mask = new mask_t[x * y]; @@ -136,7 +114,43 @@ void set_conv_bitset() { } // end anonymous namespace +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, + const float_t* neighbors, float_t* out) { + float_t* a = new float_t[len]; + float_t* b = new float_t[len]; + math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); + math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); + math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors +} + #ifndef USE_GAT +// aggregate based on graph topology +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateTime"); + aggregate_timer.start(); + // normalization constant based on graph structure +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + update_all(len, g, in, out, norm_, norm_consts); +#endif + aggregate_timer.stop(); +} + +// since graph is symmetric, the derivative is the same +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateDerivativeTime"); + aggregate_timer.start(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#else + update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#endif + aggregate_timer.stop(); +} + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) 
void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -192,7 +206,6 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, conv_timer.stop(); } -#endif // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, @@ -259,6 +272,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); } +#endif acc_t graph_conv_layer::get_weight_decay_loss() { return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index a5e6b50eec..aed0ac79b9 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -244,6 +244,14 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, galois::chunk_size<64>(), galois::loopname("d_relu")); } +void leaky_relu(float_t epsilon, float_t in, float_t &out) { + out = in > 0.0 ? in : epsilon * in; +} + +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out) { + out = in * (data > 0.0 ? 1.0 : epsilon); +} + void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) { // TODO: vectorize From 2eeea3c20fd85be382106d43318a1682d2c84780 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 25 Jul 2020 12:49:02 -0500 Subject: [PATCH 335/660] print statement fixing Reducing debug prints, reducing amount of times test/train acc is printed in distributed execution (cleans up stdout). --- libdeepgalois/src/Train.cpp | 19 +++++++++++++++---- libdeepgalois/src/layers/graph_conv_layer.cpp | 6 +----- lonestar/gnn/include/engine.h | 9 ++++++++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 992902e7b6..25b0c47d6f 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -20,7 +20,7 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host unsigned myID = 0; -#ifdef GALOIS_ENABLE_DIST +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif this->header = "[" + std::to_string(myID) + "] "; @@ -91,6 +91,9 @@ void Net::train(optimizer* opt, bool need_validate) { std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; +#ifndef GALOIS_ENABLE_GPU + unsigned hostID = galois::runtime::getSystemNetworkInterface().ID; +#endif if (subgraph_sample_size) { galois::StatTimer construct_time("SubgraphAllocateTime"); @@ -195,7 +198,9 @@ void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_ENABLE_GPU std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, separator); + if (hostID == 0) { + galois::gPrint("Epoch ", std::setw(3), curEpoch, separator); + } #endif set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -224,8 +229,10 @@ void Net::train(optimizer* opt, bool need_validate) { std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss << " train_acc " << train_acc << " "; #else - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, separator); + if (hostID == 0) { + galois::gPrint(header, "train_loss ", 
std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, separator); + } #endif t_epoch.Stop(); @@ -243,11 +250,13 @@ void Net::train(optimizer* opt, bool need_validate) { << epoch_time + val_time << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; #else + if (hostID == 0) { galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, " val_acc ", val_acc, separator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); + } #endif } else { #ifdef GALOIS_ENABLE_GPU @@ -255,8 +264,10 @@ void Net::train(optimizer* opt, bool need_validate) { << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; #else + if (hostID == 0) { galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); + } #endif } } // epoch loop diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index a17f6527bc..58da90e9ad 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -191,8 +191,6 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; set_conv_bitset(); - galois::gPrint("forward ", bitset_conv.count(), " out of ", - bitset_conv.size(), "\n"); layer::context->getSyncSubstrate() ->sync("GraphConvForward"); @@ -254,8 +252,6 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; set_conv_bitset(); - galois::gPrint("backward ", bitset_conv.count(), " out of ", - bitset_conv.size(), "\n"); layer::context->getSyncSubstrate() ->sync( @@ -269,7 +265,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, drop_timer.stop(); layer::syncSub->sync("Gradients"); - galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); } #endif diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 25644c720d..f4bbf8e5b5 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -105,10 +105,17 @@ int main(int argc, char** argv) { Ttest.start(); acc_t test_loss = 0.0, test_acc = 0.0; double test_time = network.evaluate("test", test_loss, test_acc); +#ifndef GALOIS_ENABLE_GPU + if (galois::runtime::getSystemNetworkInterface().ID == 0) { + galois::gPrint("test_loss = ", test_loss, " test_acc = ", test_acc, + " test_time = ", test_time, "\n"); + } +#else galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, " test_time = ", test_time, "\n"); +#endif Ttest.stop(); } - galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); + galois::gInfo(rm.get_peak_memory()); return 0; } From 09141e09719f3693e2173abb7a88bab748e58460 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 25 Jul 2020 13:54:27 -0500 Subject: [PATCH 336/660] report test epoch accuracy to time elapsed used to get accuracy gain to time plots --- libdeepgalois/src/Train.cpp | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 25b0c47d6f..67533bc55d 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -222,6 +222,8 @@ void Net::train(optimizer* opt, bool need_validate) 
{ // for next epoch Net::update_weights(opt); // update parameters + t_epoch.Stop(); + // validation / testing set_netphases(net_phase::test); @@ -234,12 +236,21 @@ void Net::train(optimizer* opt, bool need_validate) { train_loss, " train_acc ", train_acc, separator); } #endif - t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; - if (need_validate && curEpoch % val_interval == 0) { + // report current total time + accuracy as a stat +#ifndef GALOIS_ENABLE_GPU + if (hostID == 0) { + galois::runtime::reportParam( + std::string("GNN"), + "Epoch" + std::to_string(curEpoch) + "TestAccuracyAndTime", + std::to_string(train_acc) + ";" + std::to_string(total_train_time)); + } +#endif + + if (need_validate && (curEpoch % val_interval == 0)) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); @@ -250,13 +261,13 @@ void Net::train(optimizer* opt, bool need_validate) { << epoch_time + val_time << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; #else - if (hostID == 0) { - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, separator); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); - } + if (hostID == 0) { + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, separator); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); + } #endif } else { #ifdef GALOIS_ENABLE_GPU @@ -264,10 +275,11 @@ void Net::train(optimizer* opt, bool need_validate) { << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; #else - if (hostID == 0) { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms (fw ", - fw_time, ", bw ", epoch_time - fw_time, ")\n"); - } + if (hostID == 0) { + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + ")\n"); + } #endif } } // epoch loop From f6ba338c58784528b54939194cc21ace40164356 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 1 Aug 2020 12:47:54 -0500 Subject: [PATCH 337/660] very minor print change --- libdeepgalois/src/Train.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 67533bc55d..3a1b1c397e 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -199,7 +199,7 @@ void Net::train(optimizer* opt, bool need_validate) { std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else if (hostID == 0) { - galois::gPrint("Epoch ", std::setw(3), curEpoch, separator); + galois::gInfo("Epoch ", std::setw(3), curEpoch); } #endif set_netphases(net_phase::train); From b07f050ede97905984db7b49a73c76c99ecd6126 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 4 Sep 2020 18:08:45 -0500 Subject: [PATCH 338/660] sync only if not last layer (backward) --- .../deepgalois/layers/GradientSyncStructs.h | 2 +- libdeepgalois/src/Train.cpp | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 26 +++++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 26420aa30d..2c32f13be2 100644 --- 
a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -9,7 +9,7 @@ struct GradientSync { static ValTy extract(uint32_t, float_t& weight) { return weight; } - static bool reduce(uint32_t, float_t& weight, ValTy y) { + static bool reduce(uint32_t, float_t&, ValTy) { // TODO merge function here // for now make sure the weights are close enough // if (std::abs(weight - y) > 0.00001) { diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 3a1b1c397e..4275232baa 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -491,7 +491,7 @@ acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { - // galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 58da90e9ad..9320ade39c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -52,7 +52,7 @@ void graph_conv_layer::malloc_and_init() { new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), layer::gradientGraph->numHosts(), false); - galois::gInfo("gradient bitset size is going to be ", y * z); + galois::gInfo("gradient bitset size is going to be ", y * z, " ", y, " ", z); // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); @@ -159,6 +159,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; + galois::gPrint("forward ", x, " ", y, " ", z, "\n"); galois::StatTimer drop_timer("GraphConvForwardDropout"); drop_timer.start(); @@ -192,6 +193,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_dataToSync = out_data; set_conv_bitset(); + galois::gPrint("forward ", x, " ", y, " ", z, " sync calling\n"); layer::context->getSyncSubstrate() ->sync("GraphConvForward"); @@ -229,10 +231,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions - if (level_ != 0) // no need to calculate in_grad for the first layer + if (level_ != 0) {// no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + } // calculate weight gradients using input data; multiplied by gradients from // last back prop step math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, @@ -249,15 +252,16 @@ void graph_conv_layer::back_propagation(const float_t* in_data, compute_timer.stop(); // sync agg - deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_temp; - set_conv_bitset(); - - layer::context->getSyncSubstrate() - ->sync( - //->sync( - "GraphConvBackward"); - + //galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); + if (level_ != 0) { + deepgalois::_syncVectorSize = y; + deepgalois::_dataToSync = in_grad; + set_conv_bitset(); + layer::context->getSyncSubstrate() + 
->sync( + //->sync( + "GraphConvBackward"); + } galois::StatTimer drop_timer("GraphConvBackwardDropout"); drop_timer.start(); if (level_ != 0 && dropout_) From 57d1596e5f3397f571591d81d8e5be5f471eac16 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 14 Sep 2020 16:38:09 -0500 Subject: [PATCH 339/660] GNNGraph initial load implementation This commit adds the GNNGraph class with the implementation of the load features done: partitions, loads features, labels, and masks, and initializes the sync substrate. Some thought has to be put in on how to access the feature array being synchronized, however. Before it was done via a global... --- CMakeLists.txt | 1 + libgnn/CMakeLists.txt | 34 +++ libgnn/include/galois/GNNTypes.h | 15 ++ libgnn/include/galois/graphs/GNNGraph.h | 143 +++++++++++++ libgnn/src/GNNGraph.cpp | 265 ++++++++++++++++++++++++ libgnn/test/CMakeLists.txt | 5 + libgnn/test/gnngraph-test.cpp | 33 +++ 7 files changed, 496 insertions(+) create mode 100644 libgnn/CMakeLists.txt create mode 100644 libgnn/include/galois/GNNTypes.h create mode 100644 libgnn/include/galois/graphs/GNNGraph.h create mode 100644 libgnn/src/GNNGraph.cpp create mode 100644 libgnn/test/CMakeLists.txt create mode 100644 libgnn/test/gnngraph-test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d82f802c97..fc01f4a1ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -238,6 +238,7 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) + add_subdirectory(libgnn) endif() # TODO(loc) prefix with GALOIS diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt new file mode 100644 index 0000000000..a44b94f427 --- /dev/null +++ b/libgnn/CMakeLists.txt @@ -0,0 +1,34 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) +set(BLAS_LIB "-lopenblas -lpthread") +if(USE_MKL_BLAS) + link_directories(${INTEL_LIBS_DIR}) + message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") + SET(BLAS_INC_DIR ${MKL_ROOT}/include) + SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) + set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") +endif() + +# blas library +include_directories(${BLAS_INC_DIR}) +link_directories(${BLAS_LIB_DIR}) + +set(sources + src/GNNGraph.cpp +) + +add_library(galois_gnn STATIC ${sources}) +target_link_libraries(galois_gnn galois_shmem) +target_link_libraries(galois_gnn ${MPI_CXX_LIBRARIES}) +target_link_libraries(galois_gnn ${BLAS_LIB} ${BOOST_LIBRARIES}) +target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) +target_include_directories(galois_gnn PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +set_target_properties(galois_gnn PROPERTIES EXPORT_NAME gluon) + +add_subdirectory(test) diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h new file mode 100644 index 0000000000..aaabaa15e0 --- /dev/null +++ b/libgnn/include/galois/GNNTypes.h @@ -0,0 +1,15 @@ +#pragma once +//! @file GNNTypes.h +//! Typedefs used by the Galois GNN code + +#include + +namespace galois { +//! Floating point type to use throughout GNN compute; typedef'd so it's easier +//! to flip later +using GNNFloat = float; +//! Type of the labels for a vertex +using GNNLabel = uint8_t; +//! 
Type of a feature on vertices +using GNNFeature = float; +} // end namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h new file mode 100644 index 0000000000..8bba9609fc --- /dev/null +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -0,0 +1,143 @@ +#pragma once + +#include "galois/GNNTypes.h" +#include "galois/graphs/CuSPPartitioner.h" +#include "galois/graphs/GluonSubstrate.h" + +namespace galois { + +// TODO remove the need to hardcode this path +//! Path to location of all gnn files +static const std::string gnn_dataset_path = + "/net/ohm/export/iss/inputs/Learning/"; + +//! Helper struct to maintain start/end/size of any particular range. Mostly +//! used for mask ranges. +struct GNNRange { + size_t begin{0}; + size_t end{0}; + size_t size{0}; +}; + +namespace graphs { + +//! Possible partitioning schemes for the GNN graph +enum class GNNPartitionScheme { kOEC, kCVC }; + +//! XXX +class GNNGraph { +public: + // using LocalGraphType = LearningGraph; + using GNNDistGraph = galois::graphs::DistGraph; + + //! Loads a graph and all relevant metadata (labels, features, masks, etc.) + GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, + bool has_single_class_label); + +private: + //! In a multi-host setting, this variable stores the host id that the graph + //! is currently running on + unsigned host_id_; + //! Number of classes for a single vertex label + size_t num_label_classes_{1}; + //! Length of a feature node + size_t node_feature_length_{0}; + //! Partitioned graph + std::unique_ptr partitioned_graph_; + // XXX is this necessary + //! Copy of underlying topology of the distributed graph + // std::unique_ptr local_graph_; + //! Sync substrate for the partitioned graph + std::unique_ptr> sync_substrate_; + //! Ground truth label for nodes in the partitioned graph; Nx1 if single + //! class, N x num classes if multi-class label + std::unique_ptr local_ground_truth_labels_; + //! Feature vectors for nodes in partitioned graph + std::unique_ptr local_node_features_; + + // TODO maybe revisit this and use an actual bitset + //! Bitset indicating which nodes are training nodes + std::unique_ptr local_training_mask_; + //! Bitset indicating which nodes are validation nodes + std::unique_ptr local_validation_mask_; + //! Bitset indicating which nodes are testing nodes + std::unique_ptr local_testing_mask_; + + //! Global mask range for training nodes; must convert to LIDs when using + //! in this class + GNNRange global_training_mask_range_; + //! Global mask range for validation nodes; must convert to LIDs when using + //! in this class + GNNRange global_validation_mask_range_; + //! Global mask range for testing nodes; must convert to LIDs when using + //! in this class + GNNRange global_testing_mask_range_; + + // XXX figure out what this is really used for + //! Normalization constant based on structure of the graph + std::vector norm_factors_; + + // TODO vars for subgraphs as necessary + + //! Read labels of local nodes only + void ReadLocalLabels(const std::string& dataset_name, + bool has_single_class_label); + //! Read features of local nodes only + void ReadLocalFeatures(const std::string& dataset_str); + //! Helper function to read masks from file into the appropriate structures + //! given a name, mask type, and arrays to save into + size_t ReadLocalMasksFromFile(const std::string& dataset_name, + const std::string& mask_type, + GNNRange* mask_range, GNNLabel* masks); + //! 
Read masks of local nodes only for training, validation, and testing + void ReadLocalMasks(const std::string& dataset_name); + + // public: + // + // DGraph* getGraphPointer() { return partitionedGraph; } + // Graph* getLGraphPointer() { return lGraph; } + // Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; + // + // void initializeSyncSubstrate(); + // + // void saveDistGraph(DGraph* a); + // galois::graphs::GluonSubstrate* getSyncSubstrate(); + // float_t* get_feats_ptr() { return h_feats; } + // float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } + // label_t* get_labels_ptr() { return h_labels; } + // label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } + // float_t* get_norm_factors_ptr() { return normFactors.data(); } + // float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } + // + // //! allocate the norm factor vector + // void allocNormFactor(); + // void allocNormFactorSub(int subID); + // //! construct norm factor vector by using data from global graph + // void constructNormFactor(deepgalois::Context* globalContext); + // void constructNormFactorSub(int subgraphID); + // + // void constructSubgraphLabels(size_t m, const mask_t* masks); + // void constructSubgraphFeatures(size_t m, const mask_t* masks); + // + // //! return label for some node + // //! NOTE: this is LID, not GID + // label_t get_label(size_t lid) { return h_labels[lid]; } + // + // //! returns pointer to the features of each local node + // float_t* get_in_ptr(); + // + // //! allocate memory for subgraphs (don't actually build them) + // void allocateSubgraphs(int num_subgraphs, unsigned max_size); + // + // //! return if a vertex is owned by the partitioned graph this context + // contains bool isOwned(unsigned gid); + // //! return if part graph has provided vertex for given gid locally + // bool isLocal(unsigned gid); + // //! get GID of an lid for a vertex + // unsigned getGID(unsigned lid); + // //! get local id of a vertex given a global id for that vertex + // unsigned getLID(unsigned gid); +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp new file mode 100644 index 0000000000..5a39ed4d25 --- /dev/null +++ b/libgnn/src/GNNGraph.cpp @@ -0,0 +1,265 @@ +// XXX include net interface if necessary +#include "galois/graphs/GNNGraph.h" +#include "galois/Logging.h" + +namespace { +//! 
Partitions a particular dataset given some partitioning scheme +std::unique_ptr +LoadPartition(const std::string& dataset_name, + galois::graphs::GNNPartitionScheme partition_scheme) { + // XXX input path + std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + GALOIS_LOG_VERBOSE("File to read is {}", input_file); + + // load partition + switch (partition_scheme) { + case galois::graphs::GNNPartitionScheme::kOEC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + case galois::graphs::GNNPartitionScheme::kCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + default: + GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +} // end namespace + +galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, + GNNPartitionScheme partition_scheme, + bool has_single_class_label) { + // save host id + host_id_ = galois::runtime::getSystemNetworkInterface().ID; + // load partition + partitioned_graph_ = LoadPartition(dataset_name, partition_scheme); + + // read additional graph data + ReadLocalLabels(dataset_name, has_single_class_label); + ReadLocalFeatures(dataset_name); + ReadLocalMasks(dataset_name); + + // init gluon from the partitioned graph + sync_substrate_ = + std::make_unique>( + *partitioned_graph_, host_id_, + galois::runtime::getSystemNetworkInterface().Num, false); +} + +void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, + bool has_single_class_label) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + std::string filename = + galois::gnn_dataset_path + dataset_name + "-labels.txt"; + // read file header, save num label classes while at it + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + + // allocate memory for labels + if (has_single_class_label) { + // single-class (one-hot) label for each vertex: N x 1 + local_ground_truth_labels_ = + std::make_unique(partitioned_graph_->size()); + } else { + // multi-class label for each vertex: N x num classes + local_ground_truth_labels_ = std::make_unique( + partitioned_graph_->size() * num_label_classes_); + } + + size_t cur_gid = 0; + size_t found_local_vertices = 0; + // each line contains a set of 0s and 1s + std::string read_line; + + // loop through all labels of the graph + while (std::getline(file_stream, read_line)) { + // only process label if this node is local + if (partitioned_graph_->isLocal(cur_gid)) { + uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); + // read line as bitset of 0s and 1s + std::istringstream label_stream(read_line); + unsigned cur_bit; + // bitset size is # of label classes + for (size_t cur_class = 0; cur_class < num_label_classes_; ++cur_class) { + // read a bit + label_stream >> cur_bit; + + if (has_single_class_label) { + // in single class, only 1 bit is set in bitset; that represents the + // class to take + if (cur_bit != 0) { + // set class and break (assumption is that's the only bit that is + // set) + local_ground_truth_labels_[cur_lid] = cur_class; + break; + } + } else { + // else the entire bitset needs to be copied over to the label array + // TODO this can possibly be saved all at once rather than bit by bit? 
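+          // (multi-class labels are stored row-major as N x num_label_classes:
+          //  the local vertex id picks the row, the class index picks the column)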
+ local_ground_truth_labels_[cur_lid * num_label_classes_ + cur_class] = + cur_bit; + } + } + found_local_vertices++; + } + // always increment cur_gid + cur_gid++; + } + + file_stream.close(); + + GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); +} + +void galois::graphs::GNNGraph::ReadLocalFeatures( + const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); + + // read in dimensions of features, specifically node feature length + size_t num_vertices; + + std::string file_dims = galois::gnn_dataset_path + dataset_name + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> num_vertices >> node_feature_length_; + ifs.close(); + + GALOIS_LOG_ASSERT(num_vertices == partitioned_graph_->globalSize()); + GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_vertices, + node_feature_length_); + + // memory for all features of all nodes in graph + // TODO read features without loading entire feature file into memory; this + // is quite inefficient + std::unique_ptr full_feature_set = + std::make_unique(num_vertices * node_feature_length_); + + // read in all features + std::ifstream file_stream; + std::string feature_file = + galois::gnn_dataset_path + dataset_name + "-feats.bin"; + file_stream.open(feature_file, std::ios::binary | std::ios::in); + file_stream.read((char*)full_feature_set.get(), + sizeof(GNNFloat) * num_vertices * node_feature_length_); + file_stream.close(); + + // allocate memory for local features + local_node_features_ = std::make_unique( + partitioned_graph_->size() * node_feature_length_); + + // copy over features for local nodes only + size_t local_vertex = 0; + for (size_t i = 0; i < num_vertices; i++) { + if (partitioned_graph_->isLocal(i)) { + // copy over feature vector + std::copy(full_feature_set.get() + i * node_feature_length_, + full_feature_set.get() + (i + 1) * node_feature_length_, + &local_node_features_[local_vertex * node_feature_length_]); + local_vertex++; + } + } + full_feature_set.reset(); + GALOIS_LOG_ASSERT(local_vertex++ == partitioned_graph_->size()); +} + +//! Helper function to read masks from file into the appropriate structures +//! 
given a name, mask type, and arrays to save into +size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( + const std::string& dataset_name, const std::string& mask_type, + GNNRange* mask_range, GNNLabel* masks) { + size_t range_begin; + size_t range_end; + + // read mask range + std::string mask_filename = + galois::gnn_dataset_path + dataset_name + "-" + mask_type + "_mask.txt"; + std::ifstream mask_stream; + mask_stream.open(mask_filename, std::ios::in); + mask_stream >> range_begin >> range_end >> std::ws; + GALOIS_LOG_ASSERT(range_begin <= range_end); + + // set the range object + mask_range->begin = range_begin; + mask_range->end = range_end; + mask_range->size = range_end - range_begin; + + size_t cur_line_num = 0; + size_t local_sample_count = 0; + std::string line; + // each line is a number signifying if mask is set for the vertex + while (std::getline(mask_stream, line)) { + std::istringstream mask_stream(line); + // only examine vertices/lines in range + if (cur_line_num >= range_begin && cur_line_num < range_end) { + // only bother if node is local + if (partitioned_graph_->isLocal(cur_line_num)) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[partitioned_graph_->getLID(cur_line_num)] = 1; + local_sample_count++; + } + } + } + cur_line_num++; + } + mask_stream.close(); + + return local_sample_count; +} + +void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { + // allocate the memory for the local masks + local_training_mask_ = + std::make_unique(partitioned_graph_->size()); + local_validation_mask_ = + std::make_unique(partitioned_graph_->size()); + local_testing_mask_ = + std::make_unique(partitioned_graph_->size()); + + if (dataset_name == "reddit") { + // TODO reddit is hardcode handled at the moment; better way to not do + // this? 
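+    // Hardcoded reddit split: train = [0, 153431), val = [153431, 177262),
+    // test = [177262, 232965); sizes 153431 / 23831 / 55703 respectively.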
+ global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; + global_validation_mask_range_ = { + .begin = 153431, .end = 153431 + 23831, .size = 23831}; + global_testing_mask_range_ = { + .begin = 177262, .end = 177262 + 55703, .size = 55703}; + + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + } else { + // XXX i can get local sample counts from here if i need it + ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, + local_training_mask_.get()); + ReadLocalMasksFromFile(dataset_name, "val", &global_validation_mask_range_, + local_validation_mask_.get()); + ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, + local_testing_mask_.get()); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt new file mode 100644 index 0000000000..83c6164eac --- /dev/null +++ b/libgnn/test/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(gnngraph-test gnngraph-test.cpp) +target_link_libraries(gnngraph-test galois_gnn) +add_test(NAME gnngraph-test COMMAND gnngraph-test) + +# TODO multi host tests diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp new file mode 100644 index 0000000000..78b6804513 --- /dev/null +++ b/libgnn/test/gnngraph-test.cpp @@ -0,0 +1,33 @@ +//! @file gnngraph-test.cpp +//! Test loads a few graphs. Better if you run with multiple hosts. +//! Doesn't really do much besides that. + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + GALOIS_LOG_VERBOSE("reddit with multilabel, oec"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC, + false); + GALOIS_LOG_VERBOSE("reddit with single label, oec"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC, + true); + GALOIS_LOG_VERBOSE("reddit with multilabel, cvc"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, + false); + GALOIS_LOG_VERBOSE("reddit with single label, cvc"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, + true); + + // TODO fix citeseer and goec + // galois::graphs::GNNGraph("citeseer", + // galois::graphs::GNNPartitionScheme::kOEC, false); +} From 964bc732b561bb364a6cfb4002e42e4a15e373ab Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 28 Sep 2020 17:54:18 -0500 Subject: [PATCH 340/660] GraphConvolutionalLayer Adds an initial graph convolution layer implementation with a few optimizations still pending. Also adds various things required to make this work, including calls into a matrix multiply library, per thread RNG, etc. 
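For context, the "update" half of the layer boils down to a single dense row-major
SGEMM of the aggregated features against the layer's weight matrix, and the weight
gradient computation in the backward pass reuses the same call with the first
operand transposed. A minimal sketch of the update step, assuming an OpenBLAS-style
cblas interface (the function and variable names below are illustrative only, not
the patch's actual symbols):

    #include <cblas.h>

    // out (rows x out_cols) = aggregated (rows x in_cols) * weights (in_cols x out_cols)
    void UpdateEmbeddingsSketch(int rows, int in_cols, int out_cols,
                                const float* aggregated, const float* weights,
                                float* out) {
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, rows, out_cols,
                  in_cols, 1.0f, aggregated, in_cols, weights, out_cols, 0.0f,
                  out, out_cols);
    }
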
Also adds a test to make sure the conv layer by itself is sane + runs as expected/without crashing. Note this commit will not build because it does not include graph structure changes. Those will come in the next commit (in the spirit of keeping commits separate). --- libgnn/CMakeLists.txt | 3 + libgnn/include/galois/GNNMath.h | 18 ++ libgnn/include/galois/GNNTypes.h | 1 + libgnn/include/galois/PerThreadRNG.h | 32 +++ libgnn/include/galois/layers/GNNLayer.h | 133 +++++++++ .../galois/layers/GraphConvolutionalLayer.h | 58 ++++ libgnn/src/GNNLayer.cpp | 100 +++++++ libgnn/src/GNNMath.cpp | 42 +++ libgnn/src/GraphConvolutionalLayer.cpp | 164 +++++++++++ libgnn/test/CMakeLists.txt | 4 + libgnn/test/convlayer-test.cpp | 258 ++++++++++++++++++ libgnn/test/gnngraph-test.cpp | 1 + 12 files changed, 814 insertions(+) create mode 100644 libgnn/include/galois/GNNMath.h create mode 100644 libgnn/include/galois/PerThreadRNG.h create mode 100644 libgnn/include/galois/layers/GNNLayer.h create mode 100644 libgnn/include/galois/layers/GraphConvolutionalLayer.h create mode 100644 libgnn/src/GNNLayer.cpp create mode 100644 libgnn/src/GNNMath.cpp create mode 100644 libgnn/src/GraphConvolutionalLayer.cpp create mode 100644 libgnn/test/convlayer-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index a44b94f427..28e8dc8630 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -18,6 +18,9 @@ link_directories(${BLAS_LIB_DIR}) set(sources src/GNNGraph.cpp + src/GNNLayer.cpp + src/GNNMath.cpp + src/GraphConvolutionalLayer.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h new file mode 100644 index 0000000000..755d281752 --- /dev/null +++ b/libgnn/include/galois/GNNMath.h @@ -0,0 +1,18 @@ +#pragma once + +#include "galois/GNNTypes.h" +#include + +namespace galois { + +//! Given 2 float array pointers, do element wise addition of length elements +//! Can be called in parallel sections as its sigle threaded code +void VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + GNNFloat* output); + +//! Calls into a library BLAS call to do matrix muliply; uses default alpha/beta +void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output); + +} // namespace galois diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index aaabaa15e0..56eed101f8 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -3,6 +3,7 @@ //! Typedefs used by the Galois GNN code #include +#include namespace galois { //! Floating point type to use throughout GNN compute; typedef'd so it's easier diff --git a/libgnn/include/galois/PerThreadRNG.h b/libgnn/include/galois/PerThreadRNG.h new file mode 100644 index 0000000000..80f8d11f0a --- /dev/null +++ b/libgnn/include/galois/PerThreadRNG.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include "galois/substrate/PerThreadStorage.h" +#include "galois/GNNTypes.h" + +namespace galois { + +//! Per thread RNG object for generating numbers in parallel +class PerThreadRNG { +public: + //! Default seed 0, default distribution 0 to 1 + PerThreadRNG() : distribution_{0.0, 1.0} {}; + //! User specified range + PerThreadRNG(float begin, float end) : distribution_{begin, end} {}; + //! 
Returns a random number between numbers specified during init + GNNFloat GetRandomNumber() { + return (*distribution_.getLocal())(*engine_.getLocal()); + } + //! Return true or false based on some dropout rate + bool DoBernoulli(float dropout_rate) { + return (GetRandomNumber() > dropout_rate) ? 1 : 0; + } + +private: + //! Per thread generator of random + galois::substrate::PerThreadStorage engine_; + //! Per thread distribution of random + galois::substrate::PerThreadStorage> + distribution_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h new file mode 100644 index 0000000000..7df88d2ce7 --- /dev/null +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -0,0 +1,133 @@ +#pragma once + +#include "galois/PerThreadRNG.h" +#include "galois/graphs/GNNGraph.h" + +namespace galois { + +//! Struct holding the dimensions of a layer. Assumption is that a layer takes +//! a matrix and outputs another matrix with a different # of columns (e.g. +//! matrix multiply with a set of weights) +struct GNNLayerDimensions { + //! Number of rows in input and output of this layer + size_t input_rows; + //! Number of columns in input of this layer + size_t input_columns; + //! Number of columns output of this layer + size_t output_columns; +}; + +//! Config options for operations that can occur in a layer +struct GNNConfig { + //! True if dropout is to be done at beginning of forward phase + bool do_dropout{false}; + //! Rate at which to drop things if dropout is on + float dropout_rate{0.5}; + //! True if some activation function is to be called done at end of forward + //! phase + bool do_activation{false}; + //! True if normalization is to occur during multiplies + bool do_normalization{false}; + // TODO activation type; for now default is softmax +}; + +// Tried to avoid inheritance, but keeping track of heterogeneous layers +// becomes a mess if there isn't a base class I can create the container on. +//! Base class for layers in a graph neural network +class GNNLayer { +public: + GNNLayer() = delete; + //! Creation of a layer needs the # of the layer, the graph to train on, and + //! the input/output dimensions of the MxM that occurs in the layer; config + //! as well + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNConfig& config); + + //! Uses a default config + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + + //! Initializes all layer weights to 1. This is used as a debug function for + //! testing. + void InitAllWeightsTo1() { layer_weights_.assign(layer_weights_.size(), 1); } + + //! Conducts the forward phase given the input to this layer which + //! ultimately leads to an output (classfication of node labels) at the end + //! of the GNN. + //! @returns Output of the forward phase (i.e. input to next layer) + virtual const std::vector& + ForwardPhase(const std::vector& input_embeddings) = 0; + //! Conducts the backward phase given the input to this layer; the backward + //! phase calculates the gradients to update the weights of trainable + //! parts of the layer (e.g., weights, trainable params for aggregate, etc.). + //! @param prev_layer_input The input that was given to this layer in the + //! forward phase + //! @param input_gradient gradient from the backward phase layer before this + //! 
one; takes a pointer to save space by writing intermediate results to it + //! @returns Output of the backward phase (i.e. input to previous layer); note + //! it's a pointer because layer can mess with it + virtual std::vector* + BackwardPhase(const std::vector& prev_layer_input, + std::vector* input_gradient) = 0; + + const std::vector& GetLayerWeightGradients() { + return layer_weight_gradients_; + } + + //! Returns dimensions of this layer + // XXX may not be needed + const GNNLayerDimensions& GetLayerDimensions() { return layer_dimensions_; } + +protected: + //! Layer order (starts from 0); used in backward to shortcut output as layer + //! 0 does not need to do some things that other layers need to do + // XXX be more specific + size_t layer_number_; + //! Pointer to the graph being trained by this layer. + //! This is owned by the creator of this layer, so no need to free it when + //! this layer is destroyed. + const galois::graphs::GNNGraph& graph_; + //! Dimensions (input/output sizes) of this layer + GNNLayerDimensions layer_dimensions_; + //! Config object for certain parameters for layer + GNNConfig config_; + //! Weights used by this layer. Dimensions: input columns by output columns + std::vector layer_weights_; + //! Gradients used to update the weights of this layer + std::vector layer_weight_gradients_; + // There is a forward and a backward as their sizes will differ and we only + // want to allocate memory once to avoid runtime memory allocation. + //! The output of the forward phase for this layer. + std::vector forward_output_matrix_; + //! The output of the backward phase for this layer. + std::vector backward_output_matrix_; + //! RNG for matrix initialization + PerThreadRNG random_init_rng_{-5.0, 5.0}; + //! RNG for dropout + PerThreadRNG dropout_rng_; + //! Indicates which fields of the weight matrix are dropped if dropout is + //! used + std::vector dropout_mask_; + + ////////////////////////////////////////////////////////////////////////////// + + //! Randomly init a float vector using the class's random init RNG + void RandomInitVector(std::vector* vector_to_init); + + //! Choose a set of weights from this layer's weights to keep and save to + //! the output matrix + apply some scaling to the kept weights based on + //! dropout rate + void DoDropout(std::vector* output_matrix); + //! Apply the derivative of dropout to the backward phase output + void DoDropoutDerivative(); + + //! Does some activation function based on configuration on forward output + //! matrix + void Activation(); + //! Calculate derivative of activation function based on config on the matrix + // XXX + void ActivationDerivative(std::vector* matrix); +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h new file mode 100644 index 0000000000..6a99682b8a --- /dev/null +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -0,0 +1,58 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +namespace galois { + +class GraphConvolutionalLayer : public GNNLayer { +public: + //! Initializes the variables of the base class and also allocates additional + //! memory for temporary matrices. Also initializes sync substrate for the + //! 
weight matrix + GraphConvolutionalLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNConfig& config); + + GraphConvolutionalLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GraphConvolutionalLayer(layer_num, graph, dimensions, GNNConfig()) {} + + // Parent functions + const std::vector& + ForwardPhase(const std::vector& input_embeddings) final; + std::vector* + BackwardPhase(const std::vector& prev_layer_input, + std::vector* input_gradient) final; + +private: + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + std::vector in_temp_2_; + // Temporary matrix the size of the output of the forward pass; used if + // an intermediate op occurs before writing to the final output matrix + std::vector out_temp_; + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. + galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + + //! Performs aggregation for all nodes of the graph given the length of the + //! vector to aggregate, the features themselves, an output array, and per + //! thread storage for the intermediate scaling via norm factor + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts); + + //! Do embedding update via mxm with this layer's weights (forward) + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! 
Calculate graident via mxm with last layer's gradients (backward) + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); +}; + +} // namespace galois diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp new file mode 100644 index 0000000000..0f4eaeb36b --- /dev/null +++ b/libgnn/src/GNNLayer.cpp @@ -0,0 +1,100 @@ +#include "galois/layers/GNNLayer.h" + +galois::GNNLayer::GNNLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNConfig& config) + : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), + config_(config) { + // TODO some of this does not need alloc if not used + // dropout allocation; dropout is same as input + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + // allocate memory based on layer dimensions + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + layer_weights_.resize(num_weight_elements); + layer_weight_gradients_.resize(num_weight_elements, 0); + // init weights randomly with a parallel loop + RandomInitVector(&layer_weights_); + + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + forward_output_matrix_.resize(num_output_elements, 0); + backward_output_matrix_.resize( + layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); +} + +void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { + galois::do_all( + galois::iterate(static_cast(0), vector_to_init->size()), + [&](size_t i) { + // pull from the class's per thread RNG + (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); + }, + galois::loopname("RandomInitVector")); +} + +void galois::GNNLayer::DoDropout(std::vector* output_matrix) { + // XXX fix droptout, should use inputs not weights + size_t num_weights = layer_weights_.size(); + // determine which weights to drop + galois::do_all( + galois::iterate(static_cast(0), num_weights), + [&](size_t i) { + dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); + }, + galois::loopname("LayerDropoutRNG")); + + // create new matrix with non-dropped weights + some scaling + // TODO scaling? + GNNFloat scale = 1. / (1. - config_.dropout_rate); + galois::do_all( + galois::iterate(static_cast(0), num_weights), + [&](size_t i) { + (*output_matrix)[i] = + layer_weights_[i] * static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("LayerDropout")); +} + +void galois::GNNLayer::DoDropoutDerivative() { + GNNFloat scale = 1. / (1. 
- config_.dropout_rate); + // use dropout mask to figure out derivative + galois::do_all( + galois::iterate(static_cast(0), backward_output_matrix_.size()), + [&](size_t i) { + backward_output_matrix_[i] = backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * + scale; + }, + galois::loopname("LayerDropoutDerivative")); +} + +void galois::GNNLayer::Activation() { + // TODO only does relu at the moment; should check user specified activation + // and act accordingly + galois::do_all( + galois::iterate(static_cast(0), forward_output_matrix_.size()), + [&](size_t i) { + forward_output_matrix_[i] = + std::max(forward_output_matrix_.at(i), static_cast(0)); + }, + galois::loopname("ReLU")); +} + +void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { + // TODO only does relu at the moment; should check user specified activation + // and act accordingly + // XXX + // keep gradient if the original output is greater than 0 + galois::do_all( + galois::iterate(static_cast(0), gradient->size()), + [&](size_t i) { + (*gradient)[i] = + (forward_output_matrix_.at(i) > static_cast(0)) + ? (*gradient)[i] + : static_cast(0); + }, + galois::loopname("ReLU-Derivative")); +} diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp new file mode 100644 index 0000000000..5ba0fdcf64 --- /dev/null +++ b/libgnn/src/GNNMath.cpp @@ -0,0 +1,42 @@ +#include +#include "galois/GNNMath.h" + +void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + GNNFloat* output) { +#ifdef __AVX2__ + constexpr size_t vectorization_length = + 8; // for 32-bit floating point in AVX2; TODO AVX512 + // can only do up to a particular multiple due to alignment + const size_t aligned_end = length - length % vectorization_length; + // do add via vector ops + for (size_t i = 0; i < aligned_end; i += vectorization_length) { + _mm256_storeu_ps(&output[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), + _mm256_loadu_ps(&b[i]))); + } + + // handle the rest + for (size_t i = aligned_end; i < length; ++i) { + output[i] = a[i] + b[i]; + } +#else + // no vector -> trivial loop add + for (size_t i = 0; i < length; ++i) { + output[i] = a[i] + b[i]; + } +#endif +} + +void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, + const CBLAS_TRANSPOSE trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, + GNNFloat* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? 
output_columns : input_columns; + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, 0.0, output, + output_columns); +} diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp new file mode 100644 index 0000000000..bb00b83d61 --- /dev/null +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -0,0 +1,164 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +galois::GraphConvolutionalLayer::GraphConvolutionalLayer( + size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNConfig& config) + : galois::GNNLayer::GNNLayer(layer_num, graph, dimensions, config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + size_t num_input_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + // TODO temp2 does not need to be initialized in all circumstances + in_temp_2_.resize(num_input_elements, 0); + + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + out_temp_.resize(num_output_elements, 0); +} + +const std::vector& +galois::GraphConvolutionalLayer::ForwardPhase( + const std::vector& input_embeddings) { + assert(input_embeddings.size() == + (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); + assert(in_temp_1_.size() == input_embeddings.size()); + assert(in_temp_2_.size() == input_embeddings.size()); + assert(forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + // TODO only dropout if in training apparently + if (config_.do_dropout) { + GALOIS_LOG_VERBOSE("Doing dropout"); + DoDropout(&in_temp_1_); + input_data = in_temp_1_.data(); + } + + GALOIS_LOG_VERBOSE("Doing aggregate"); + // aggregation and update (or vice versa) + AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), + &input_column_intermediates_); + GALOIS_LOG_VERBOSE("Doing embedding update"); + // TODO synchronization of aggregation functions + UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); + + // TODO if input columns > output columns do update first then aggregate for + // efficiency + + if (config_.do_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + Activation(); + } + + assert(forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return forward_output_matrix_; +} + +std::vector* galois::GraphConvolutionalLayer::BackwardPhase( + const std::vector& prev_layer_input, + std::vector* input_gradient) { + // derivative of activation + if (config_.do_activation) { + ActivationDerivative(input_gradient); + } + + // derivative of aggregation/update + // TODO do optimized cased like the forward + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), + backward_output_matrix_.data(), &input_column_intermediates_); + } + // TODO sync agg/update + + // weight gradient 
calculation + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, prev_layer_input.data(), + input_gradient->data(), layer_weight_gradients_.data()); + // TODO sync weights + + if (config_.do_dropout) { + DoDropoutDerivative(); + } + + return &backward_output_matrix_; +} + +void galois::GraphConvolutionalLayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts) { + size_t num_nodes = graph_.size(); + // galois::gPrint(pts, "\n"); + + galois::do_all( + galois::iterate(static_cast(0), num_nodes), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + // TODO can init to self as well + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + GNNFloat source_norm = 0.0; + if (config_.do_normalization) { + source_norm = graph_.NormFactor(src); + } + + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + size_t dst = graph_.EdgeDestination(e); + size_t index_to_dst_feature = dst * column_length; + + if (config_.do_normalization) { + GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); + // scale the value on the destination by the combined norm term + assert(pts->getLocal()->size() == column_length); + GNNFloat* intermediate = pts->getLocal()->data(); + for (size_t i = 0; i < column_length; i++) { + intermediate[i] = + norm_scale * node_embeddings[index_to_dst_feature + i]; + } + // add intermediate instead of original feature + galois::VectorAdd( + column_length, &aggregate_output[index_to_src_feature], + intermediate, &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + }, + galois::steal(), galois::loopname("ConvolutionalAggregateAll")); +} + +void galois::GraphConvolutionalLayer::UpdateEmbeddings( + const GNNFloat* node_embeddings, GNNFloat* output) { + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_.data(), output); +} + +void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( + const GNNFloat* gradients, GNNFloat* output) { + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 83c6164eac..9469f86aba 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,4 +2,8 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) +add_executable(convlayer-test convlayer-test.cpp) +target_link_libraries(convlayer-test galois_gnn) +add_test(NAME convlayer-test COMMAND convlayer-test) + # TODO multi host tests diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp new file mode 100644 index 0000000000..2d98875a0d --- 
/dev/null +++ b/libgnn/test/convlayer-test.cpp @@ -0,0 +1,258 @@ +//! @file convlayer-test.cpp +//! Conv layer test with a test graph + +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + // size_t num_threads = galois::setActiveThreads( + // 56 / galois::runtime::getSystemNetworkInterface().Num); + size_t num_threads = galois::setActiveThreads(1); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector feats = test_graph.GetLocalFeatures(); + ////////////////////////////////////////////////////////////////////////////// + // doubles as a test for reading as well + GALOIS_LOG_ASSERT(21 == feats.size()); + GALOIS_LOG_ASSERT(0.0 == feats[0]); + GALOIS_LOG_ASSERT(0.0 == feats[1]); + GALOIS_LOG_ASSERT(0.0 == feats[2]); + GALOIS_LOG_ASSERT(1.0 == feats[3]); + GALOIS_LOG_ASSERT(1.0 == feats[4]); + GALOIS_LOG_ASSERT(1.0 == feats[5]); + GALOIS_LOG_ASSERT(2.0 == feats[6]); + GALOIS_LOG_ASSERT(2.0 == feats[7]); + GALOIS_LOG_ASSERT(2.0 == feats[8]); + GALOIS_LOG_ASSERT(3.0 == feats[9]); + GALOIS_LOG_ASSERT(3.0 == feats[10]); + GALOIS_LOG_ASSERT(3.0 == feats[11]); + GALOIS_LOG_ASSERT(4.0 == feats[12]); + GALOIS_LOG_ASSERT(4.0 == feats[13]); + GALOIS_LOG_ASSERT(4.0 == feats[14]); + GALOIS_LOG_ASSERT(5.0 == feats[15]); + GALOIS_LOG_ASSERT(5.0 == feats[16]); + GALOIS_LOG_ASSERT(5.0 == feats[17]); + GALOIS_LOG_ASSERT(6.0 == feats[18]); + GALOIS_LOG_ASSERT(6.0 == feats[19]); + GALOIS_LOG_ASSERT(6.0 == feats[20]); + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerDimensions dimension_0{ + .input_rows = 7, .input_columns = 3, .output_columns = 2}; + + // create the layer, no norm factor + // note layer number is 1 so that it does something in backward phase + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, + dimension_0); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + const std::vector& layer_0_forward_output = + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones(14, 1); + + // backward pass checking + // layer 0 means that an empty weight matrix is 
returned since there is no + // point passing back anything + std::vector* layer_0_backward_output = + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); + + const std::vector layer_0_weight_gradients = + layer_0->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + layer_0.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0); + layer_1->InitAllWeightsTo1(); + const std::vector& layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + // since layer isn't 0 anymore, backward phase will actually return something + 
dummy_ones.assign(14, 1); + std::vector* layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); + + const std::vector layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + layer_1.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNConfig config = { + .do_dropout = true, .do_activation = true, .do_normalization = true}; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + std::unique_ptr layer_2 = + std::make_unique(1, test_graph, + dimension_0, config); + const std::vector l2_fo = + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + std::vector* l2_bo = + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + GALOIS_LOG_ASSERT(l2_bo->size() == 21); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); + 
GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + + return 0; +} diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 78b6804513..7db24081f5 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -30,4 +30,5 @@ int main() { // TODO fix citeseer and goec // galois::graphs::GNNGraph("citeseer", // galois::graphs::GNNPartitionScheme::kOEC, false); + return 0; } From 90a1447763d0aa4bde0a9b14a25579aa5f1ce9e6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 28 Sep 2020 18:00:10 -0500 Subject: [PATCH 341/660] GNNGraph graph accessors, whole graph loading Adds a few access functions to the GNN Graph such as edge iterators, array accessors, etc. Also adds whole graph loading for the purposes of norm calculation (and other things that may require the entire graph down the line). Also hardcodes the tester graph's test/train/val boundaries because the partitioner requires it. --- libcusp/include/galois/graphs/NewGeneric.h | 8 +- libgnn/include/galois/graphs/GNNGraph.h | 69 ++++++++++++++---- libgnn/src/GNNGraph.cpp | 85 +++++++++++++++++----- 3 files changed, 129 insertions(+), 33 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index a5eb13ee5b..3af95db9dd 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -76,7 +76,7 @@ class NewDistGraphGeneric : public DistGraph { uint32_t nodesToReceive; std::vector getGNNBreakpoints(std::string filename) { - // contains 2 numbers: begin and end of test + // contains 2 numbers: begin and end of train // everything else can be split evenly among hosts as they are not // performance critical std::vector bps; @@ -91,6 +91,12 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("ppi") != std::string::npos) { bps.push_back(0); bps.push_back(9716); + } else if (filename.find("tester") != std::string::npos) { + bps.push_back(0); + bps.push_back(5); + } else { + GALOIS_DIE("invalid input for gnn partitioning ", filename, + " hardcode needed"); } // TODO hardcode the rest diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 8bba9609fc..81d94f1948 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -27,13 +27,45 @@ enum class GNNPartitionScheme { kOEC, kCVC }; //! XXX class GNNGraph { public: - // using LocalGraphType = LearningGraph; using GNNDistGraph = galois::graphs::DistGraph; + using WholeGraph = galois::graphs::LC_CSR_Graph; + using GraphNode = GNNDistGraph::GraphNode; + using EdgeIterator = GNNDistGraph::edge_iterator; //! Loads a graph and all relevant metadata (labels, features, masks, etc.) 
GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); + //! Return # of nodes in the partitioned graph + size_t size() const { return partitioned_graph_->size(); } + + // All following functions take a local id + EdgeIterator EdgeBegin(GraphNode n) const { + return partitioned_graph_->edge_begin(n); + }; + EdgeIterator EdgeEnd(GraphNode n) const { + return partitioned_graph_->edge_end(n); + }; + GraphNode EdgeDestination(EdgeIterator ei) const { + return partitioned_graph_->getEdgeDst(ei); + }; + GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + + const std::vector& GetLocalFeatures() const { + return local_node_features_; + } + + //! Returns a pointer to the CSR indices where the first element starts at + //! 0 (used with MKL) + const uint32_t* GetZeroBasedRowPointer() { + return zero_start_graph_indices_.data(); + } + + //! Return pointer to all edge destinations; used with MKL + const uint32_t* GetEdgeDestPointer() { + return partitioned_graph_->edge_dst_ptr(); + } + private: //! In a multi-host setting, this variable stores the host id that the graph //! is currently running on @@ -44,6 +76,12 @@ class GNNGraph { size_t node_feature_length_{0}; //! Partitioned graph std::unique_ptr partitioned_graph_; + //! The entire topology of the dataset: used for things like norm factor + //! calculation or sampling + WholeGraph whole_graph_; + //! The indices pointer from the partitioned graph except with a 0 + //! prepended to it; needed for MKL calls + std::vector zero_start_graph_indices_; // XXX is this necessary //! Copy of underlying topology of the distributed graph // std::unique_ptr local_graph_; @@ -51,17 +89,17 @@ class GNNGraph { std::unique_ptr> sync_substrate_; //! Ground truth label for nodes in the partitioned graph; Nx1 if single //! class, N x num classes if multi-class label - std::unique_ptr local_ground_truth_labels_; + std::vector local_ground_truth_labels_; //! Feature vectors for nodes in partitioned graph - std::unique_ptr local_node_features_; + std::vector local_node_features_; // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes - std::unique_ptr local_training_mask_; + std::vector local_training_mask_; //! Bitset indicating which nodes are validation nodes - std::unique_ptr local_validation_mask_; + std::vector local_validation_mask_; //! Bitset indicating which nodes are testing nodes - std::unique_ptr local_testing_mask_; + std::vector local_testing_mask_; //! Global mask range for training nodes; must convert to LIDs when using //! in this class @@ -73,8 +111,7 @@ class GNNGraph { //! in this class GNNRange global_testing_mask_range_; - // XXX figure out what this is really used for - //! Normalization constant based on structure of the graph + //! Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; // TODO vars for subgraphs as necessary @@ -91,15 +128,17 @@ class GNNGraph { GNNRange* mask_range, GNNLabel* masks); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); + //! Init the node start indices that have a 0 at the beginning; straight + //! copy of the array from the partitioned graph save for the 0 at the + //! first element. + void InitZeroStartGraphIndices(); + //! Reads the entire graph topology in (but nothing else) + void ReadWholeGraph(const std::string& dataset_name); + //! 
Initializes the norm factors using the entire graph's topology for global + //! degree access + void InitNormFactor(); // public: - // - // DGraph* getGraphPointer() { return partitionedGraph; } - // Graph* getLGraphPointer() { return lGraph; } - // Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; - // - // void initializeSyncSubstrate(); - // // void saveDistGraph(DGraph* a); // galois::graphs::GluonSubstrate* getSyncSubstrate(); // float_t* get_feats_ptr() { return h_feats; } diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp index 5a39ed4d25..78ff5d828c 100644 --- a/libgnn/src/GNNGraph.cpp +++ b/libgnn/src/GNNGraph.cpp @@ -1,6 +1,7 @@ // XXX include net interface if necessary -#include "galois/graphs/GNNGraph.h" #include "galois/Logging.h" +#include "galois/graphs/ReadGraph.h" +#include "galois/graphs/GNNGraph.h" namespace { //! Partitions a particular dataset given some partitioning scheme @@ -9,7 +10,7 @@ LoadPartition(const std::string& dataset_name, galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; - GALOIS_LOG_VERBOSE("File to read is {}", input_file); + GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); // load partition switch (partition_scheme) { @@ -30,6 +31,8 @@ LoadPartition(const std::string& dataset_name, galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label) { + GALOIS_LOG_VERBOSE("[{}] Constructing partitiong for {}", host_id_, + dataset_name); // save host id host_id_ = galois::runtime::getSystemNetworkInterface().ID; // load partition @@ -45,6 +48,13 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, std::make_unique>( *partitioned_graph_, host_id_, galois::runtime::getSystemNetworkInterface().Num, false); + + // create the 0 based row indices for MKL use + InitZeroStartGraphIndices(); + // read in entire graph topology + ReadWholeGraph(dataset_name); + // init norm factors using the whole graph topology + InitNormFactor(); } void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, @@ -62,12 +72,11 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, // allocate memory for labels if (has_single_class_label) { // single-class (one-hot) label for each vertex: N x 1 - local_ground_truth_labels_ = - std::make_unique(partitioned_graph_->size()); + local_ground_truth_labels_.resize(partitioned_graph_->size()); } else { // multi-class label for each vertex: N x num classes - local_ground_truth_labels_ = std::make_unique( - partitioned_graph_->size() * num_label_classes_); + local_ground_truth_labels_.resize(partitioned_graph_->size() * + num_label_classes_); } size_t cur_gid = 0; @@ -148,8 +157,8 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( file_stream.close(); // allocate memory for local features - local_node_features_ = std::make_unique( - partitioned_graph_->size() * node_feature_length_); + local_node_features_.resize(partitioned_graph_->size() * + node_feature_length_); // copy over features for local nodes only size_t local_vertex = 0; @@ -214,12 +223,9 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { // allocate the memory for the local masks - local_training_mask_ = - std::make_unique(partitioned_graph_->size()); - local_validation_mask_ = - 
std::make_unique(partitioned_graph_->size()); - local_testing_mask_ = - std::make_unique(partitioned_graph_->size()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); if (dataset_name == "reddit") { // TODO reddit is hardcode handled at the moment; better way to not do @@ -256,10 +262,55 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } else { // XXX i can get local sample counts from here if i need it ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, - local_training_mask_.get()); + local_training_mask_.data()); ReadLocalMasksFromFile(dataset_name, "val", &global_validation_mask_range_, - local_validation_mask_.get()); + local_validation_mask_.data()); ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, - local_testing_mask_.get()); + local_testing_mask_.data()); } } + +void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { + GALOIS_LOG_VERBOSE("[{}] Initializing node indices with 0 prepended", + host_id_); + // size is num nodes + 1 + zero_start_graph_indices_.resize(partitioned_graph_->size() + 1); + // first element is zero + zero_start_graph_indices_[0] = 0; + // the rest is a straight copy from partitioned graph (use edge_end to access + // it) + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t i) { + zero_start_graph_indices_[i + 1] = *(partitioned_graph_->edge_end(i)); + }, + galois::loopname("InitZeroStartGraphIndices")); +} + +void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { + std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, + input_file); + galois::graphs::readGraph(whole_graph_, input_file); +} + +void galois::graphs::GNNGraph::InitNormFactor() { + GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); + norm_factors_.resize(partitioned_graph_->size(), 0.0); + + // get the norm factor contribution for each node based on the GLOBAL graph + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t local_id) { + // translate lid into gid to get global degree + size_t global_id = partitioned_graph_->getGID(local_id); + size_t global_degree = whole_graph_.edge_end(global_id) - + whole_graph_.edge_begin(global_id); + // only set if non-zero + if (global_degree != 0) { + norm_factors_[local_id] = + 1.0 / std::sqrt(static_cast(global_degree)); + } + }, + galois::loopname("InitNormFactor")); +} From f6d097d301d968f6c40aa52f9c958fbbf851d36f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:32:55 -0500 Subject: [PATCH 342/660] Layer phases + graph accessors Added more accessors to the graph like node iterators and LID-GID functions. More importantly, added the concept of a layer phase (test, validate, train) which causes training/masking to occur differently. Added a function to the graph object to check if an LID is part of a particular group of nodes to train as well. 
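
As a usage sketch only (not part of this patch; gnn_graph and the loop are
stand-ins), the intent is that a layer restricts its per-node work to the
nodes that are valid for its current phase:

    // assumes gnn_graph is a constructed galois::graphs::GNNGraph
    galois::GNNPhase phase = galois::GNNPhase::kTrain;
    for (unsigned lid = 0; lid < gnn_graph.size(); ++lid) {
      if (!gnn_graph.IsValidForPhase(lid, phase)) {
        continue; // node is masked out for this phase
      }
      // per-node work (loss, accuracy accumulation, etc.) goes here
    }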
--- libgnn/include/galois/GNNTypes.h | 4 +++ libgnn/include/galois/graphs/GNNGraph.h | 34 ++++++++++++++++++++---- libgnn/src/GNNGraph.cpp | 35 +++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 56eed101f8..a04fa14687 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -13,4 +13,8 @@ using GNNFloat = float; using GNNLabel = uint8_t; //! Type of a feature on vertices using GNNFeature = float; + +//! Phase of GNN computation +enum class GNNPhase { kTrain, kValidate, kTest }; + } // end namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 81d94f1948..cefc505992 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -30,6 +30,8 @@ class GNNGraph { using GNNDistGraph = galois::graphs::DistGraph; using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; + // defined as such because dist graph range objects used long unsigned + using NodeIterator = boost::counting_iterator; using EdgeIterator = GNNDistGraph::edge_iterator; //! Loads a graph and all relevant metadata (labels, features, masks, etc.) @@ -39,6 +41,29 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + //! Node begin for all local nodes + NodeIterator begin() const { + return partitioned_graph_->allNodesRange().begin(); + } + //! Node end for all local nodes + NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } + //! Return GID of some local node + size_t GetGID(unsigned lid) const { return partitioned_graph_->getGID(lid); } + //! Given an LID and the current phase of GNN computation, determine if the + //! lid in question is valid for the current phase (i.e., it is part of + //! a training, validation, or test phase mask) + bool IsValidForPhase(const unsigned lid, + const galois::GNNPhase current_phase) const; + //! Returns the label of some local id assuming labels are single class + //! labels. + GNNFloat GetSingleClassLabel(const unsigned lid) const { + assert(using_single_class_labels_); + return local_ground_truth_labels_[lid]; + } + + //! Return the number of label classes + size_t GetNumLabelClasses() const { return num_label_classes_; }; + // All following functions take a local id EdgeIterator EdgeBegin(GraphNode n) const { return partitioned_graph_->edge_begin(n); @@ -57,12 +82,12 @@ class GNNGraph { //! Returns a pointer to the CSR indices where the first element starts at //! 0 (used with MKL) - const uint32_t* GetZeroBasedRowPointer() { + const uint32_t* GetZeroBasedRowPointer() const { return zero_start_graph_indices_.data(); } //! Return pointer to all edge destinations; used with MKL - const uint32_t* GetEdgeDestPointer() { + const uint32_t* GetEdgeDestPointer() const { return partitioned_graph_->edge_dst_ptr(); } @@ -82,11 +107,10 @@ class GNNGraph { //! The indices pointer from the partitioned graph except with a 0 //! prepended to it; needed for MKL calls std::vector zero_start_graph_indices_; - // XXX is this necessary - //! Copy of underlying topology of the distributed graph - // std::unique_ptr local_graph_; //! Sync substrate for the partitioned graph std::unique_ptr> sync_substrate_; + //! True if labels are single class + bool using_single_class_labels_; //! 
Ground truth label for nodes in the partitioned graph; Nx1 if single //! class, N x num classes if multi-class label std::vector local_ground_truth_labels_; diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp index 78ff5d828c..38a78d68dc 100644 --- a/libgnn/src/GNNGraph.cpp +++ b/libgnn/src/GNNGraph.cpp @@ -57,6 +57,39 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, InitNormFactor(); } +bool galois::graphs::GNNGraph::IsValidForPhase( + const unsigned lid, const galois::GNNPhase current_phase) const { + // convert to gid first + size_t gid = partitioned_graph_->getGID(lid); + + // select range to use based on phase + const GNNRange* range_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + range_to_use = &global_training_mask_range_; + break; + case GNNPhase::kValidate: + range_to_use = &global_validation_mask_range_; + break; + case GNNPhase::kTest: + range_to_use = &global_testing_mask_range_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + range_to_use = nullptr; + } + + // if within range, it is valid + // TODO there is an assumption here that ranges are contiguous; may not + // necessarily be the case in all inputs in which case using the mask is safer + // (but less cache efficient) + if (range_to_use->begin <= gid && gid < range_to_use->end) { + return true; + } else { + return false; + } +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -72,9 +105,11 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, // allocate memory for labels if (has_single_class_label) { // single-class (one-hot) label for each vertex: N x 1 + using_single_class_labels_ = true; local_ground_truth_labels_.resize(partitioned_graph_->size()); } else { // multi-class label for each vertex: N x num classes + using_single_class_labels_ = false; local_ground_truth_labels_.resize(partitioned_graph_->size() * num_label_classes_); } From 5e5214a7adf750babade5d5f78b3c832cfa1669f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:37:01 -0500 Subject: [PATCH 343/660] GNNLayers: layer phases and allocation disabling Added the concept of layer phases to the GNN layer base class and added a config option to disable allocation of weights (e.g., output layers do not need weights to be allocated). --- libgnn/include/galois/layers/GNNLayer.h | 7 +++++++ libgnn/src/GNNLayer.cpp | 24 +++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 7df88d2ce7..e5d83678f1 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -19,6 +19,8 @@ struct GNNLayerDimensions { //! Config options for operations that can occur in a layer struct GNNConfig { + //! True if weights should be allocated + bool allocate_weights{true}; //! True if dropout is to be done at beginning of forward phase bool do_dropout{false}; //! Rate at which to drop things if dropout is on @@ -48,6 +50,9 @@ class GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + //! Changes this layer's phase + void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } + //! Initializes all layer weights to 1. This is used as a debug function for //! testing. 
void InitAllWeightsTo1() { layer_weights_.assign(layer_weights_.size(), 1); } @@ -109,6 +114,8 @@ class GNNLayer { //! Indicates which fields of the weight matrix are dropped if dropout is //! used std::vector dropout_mask_; + //! Phase of GNN computation that this layer is currently in + galois::GNNPhase layer_phase_{galois::GNNPhase::kTrain}; ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 0f4eaeb36b..d14a5d1b05 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -6,17 +6,19 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const GNNConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { - // TODO some of this does not need alloc if not used - // dropout allocation; dropout is same as input - dropout_mask_.resize(layer_dimensions_.input_rows * - layer_dimensions_.input_columns); - // allocate memory based on layer dimensions - size_t num_weight_elements = - layer_dimensions_.input_columns * layer_dimensions_.output_columns; - layer_weights_.resize(num_weight_elements); - layer_weight_gradients_.resize(num_weight_elements, 0); - // init weights randomly with a parallel loop - RandomInitVector(&layer_weights_); + if (config_.allocate_weights) { + // TODO some of this does not need alloc if not used + // dropout allocation; dropout is same as input + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + // allocate memory based on layer dimensions + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + layer_weights_.resize(num_weight_elements); + layer_weight_gradients_.resize(num_weight_elements, 0); + // init weights randomly with a parallel loop + RandomInitVector(&layer_weights_); + } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; From 8d19a26de4cbd4f16b4b3d4c3e9e617ed306da92 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:38:33 -0500 Subject: [PATCH 344/660] GNNMath: softmax, cross entropy, max selector Added softmax, cross entropy (and their deriviatives) for use in output layers. Also added a utility function that selects the max element in a list of elements. --- libgnn/include/galois/GNNMath.h | 22 +++++++++ libgnn/src/GNNMath.cpp | 79 +++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 755d281752..2cf913d5de 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -5,11 +5,33 @@ namespace galois { +//! Find max index in a vector of some length +size_t MaxIndex(const size_t length, const GNNFloat* vector); //! Given 2 float array pointers, do element wise addition of length elements //! Can be called in parallel sections as its sigle threaded code void VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Does a softmax operation on the input vector and saves result to output +//! vector; single threaded so it can be called in a parallel section +void GNNSoftmax(const size_t vector_length, const GNNFloat* input, + GNNFloat* output); +//! Get derivative of softmax given the forward pass's input, the derivative +//! from loss calculation, and a temp vector to store intermediate results. +//! Everything is the same size. 
+void GNNSoftmaxDerivative(const size_t vector_length, + const GNNFloat* prev_output, + const GNNFloat* prev_output_derivative, + GNNFloat* temp_vector, GNNFloat* output); +//! Performs cross entropy given a ground truth and input and returns the loss +//! value. +galois::GNNFloat GNNCrossEntropy(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input); +//! Derivative of cross entropy; gradients saved into an output vector. +void GNNCrossEntropyDerivative(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input, GNNFloat* gradients); //! Calls into a library BLAS call to do matrix muliply; uses default alpha/beta void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 5ba0fdcf64..303e872e2a 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -1,5 +1,22 @@ +#include +#include #include #include "galois/GNNMath.h" +#include "galois/Logging.h" + +size_t galois::MaxIndex(const size_t length, const GNNFloat* vector) { + size_t index = 0; + GNNFloat cur_max = vector[0]; + + for (size_t i = 1; i < length; i++) { + if (vector[i] > cur_max) { + index = i; + cur_max = vector[i]; + } + } + + return index; +} void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, GNNFloat* output) { @@ -26,6 +43,68 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, #endif } +void galois::GNNSoftmax(const size_t vector_length, const GNNFloat* input, + GNNFloat* output) { + const GNNFloat max_element = + *(std::max_element(input, input + vector_length)); + GNNFloat denominator = 0; + // normalize all elements using exponentional of max element + for (size_t i = 0; i < vector_length; i++) { + output[i] = std::exp(input[i] - max_element); + denominator += output[i]; + } + // divide all by total to get a distribution + for (size_t i = 0; i < vector_length; i++) { + output[i] /= denominator; + } +} + +void galois::GNNSoftmaxDerivative(const size_t vector_length, + const GNNFloat* prev_output, + const GNNFloat* prev_output_derivative, + GNNFloat* temp_vector, GNNFloat* output) { + for (size_t i = 0; i < vector_length; i++) { + for (size_t j = 0; j < vector_length; j++) { + temp_vector[j] = (j == i) ? prev_output[i] * (1.0 - prev_output[i]) + : -prev_output[j] * prev_output[i]; + } + // TODO is sdot using threads? 
if so this is a nested parallelism problem + output[i] = + cblas_sdot(vector_length, prev_output_derivative, 1, temp_vector, 1); + } +} + +galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input) { + GNNFloat loss = 0.0; + + for (size_t i = 0; i < vector_length; i++) { + if (ground_truth[i] == 0.0) { + continue; + } + + GALOIS_LOG_VERBOSE("Truth {} input {}", ground_truth[i], input[i]); + + if (input[i] == 0.0) { + loss -= ground_truth[i] * std::log(static_cast(1e-10)); + } else { + loss -= ground_truth[i] * std::log(input[i]); + } + } + + return loss; +} + +void galois::GNNCrossEntropyDerivative(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input, + GNNFloat* gradients) { + for (size_t i = 0; i < vector_length; i++) { + gradients[i] = -(ground_truth[i]) / (input[i] + 1e-10); + } +} + void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, From ac2ce73e13b8ea52a108f640ade306317ca44b3c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:41:31 -0500 Subject: [PATCH 345/660] GNN softmax loss output layer This commit adds an implementation of the Softmax Loss output layer. The layer creates a probability distribution of the rows of a passed in matrix. The backward phase returns gradients on how to best move computation towards a distribution that favors the ground truth. Also includes some minor cleanup of the GCN layer. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/layers/SoftmaxLayer.h | 51 +++++++++++++ libgnn/src/GraphConvolutionalLayer.cpp | 3 +- libgnn/src/SoftmaxLayer.cpp | 83 +++++++++++++++++++++ 4 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 libgnn/include/galois/layers/SoftmaxLayer.h create mode 100644 libgnn/src/SoftmaxLayer.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 28e8dc8630..24f1f0e33d 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -21,6 +21,7 @@ set(sources src/GNNLayer.cpp src/GNNMath.cpp src/GraphConvolutionalLayer.cpp + src/SoftmaxLayer.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h new file mode 100644 index 0000000000..45116f1b62 --- /dev/null +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -0,0 +1,51 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +namespace galois { + +//! Softmax layer: takes each row of the input matrix and creates a probability +//! distribution based on the magnitude of elements in each row. +//! Currently this only works with **single class* labels and is coded as such. +class SoftmaxLayer : public GNNLayer { +public: + SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GNNLayer(layer_num, graph, dimensions, + GNNConfig{.allocate_weights = false}), + input_loss_(dimensions.input_rows), + ground_truth_vectors_(dimensions.input_columns), + norm_gradient_vectors_(dimensions.input_columns), + softmax_temp_vectors_(dimensions.input_columns) { + // input/output columns must be equivalent in a softmax + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + } + + //! Creates probability distribution of each row of input + const std::vector& + ForwardPhase(const std::vector& input_embeddings) final; + + //! 
Get gradients to fix distribution such that it leans more towards single + //! class ground truth. + std::vector* + BackwardPhase(const std::vector& prev_layer_input, + std::vector* input_gradient) final; + + // TODO prediction loss function? +private: + //! Loss for each row of the input + std::vector input_loss_; + //! Each thread gets storage to allocate the ground truth vector in during + //! calculation; each vector is the size of a feature vector + galois::substrate::PerThreadStorage> + ground_truth_vectors_; + //! Each thread gets storage to allocate the gradients during backward + //! prop; each is the size of a feature vector + galois::substrate::PerThreadStorage> + norm_gradient_vectors_; + //! Each thread gets storage for a temporary vector used during softmax + //! derivative calculation; each is the size of a feature vector + galois::substrate::PerThreadStorage> + softmax_temp_vectors_; +}; + +} // namespace galois diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index bb00b83d61..fecea27d17 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -5,7 +5,7 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, const GNNConfig& config) - : galois::GNNLayer::GNNLayer(layer_num, graph, dimensions, config), + : GNNLayer(layer_num, graph, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = @@ -98,7 +98,6 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts) { size_t num_nodes = graph_.size(); - // galois::gPrint(pts, "\n"); galois::do_all( galois::iterate(static_cast(0), num_nodes), diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/SoftmaxLayer.cpp new file mode 100644 index 0000000000..1c7073e560 --- /dev/null +++ b/libgnn/src/SoftmaxLayer.cpp @@ -0,0 +1,83 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SoftmaxLayer.h" + +const std::vector& galois::SoftmaxLayer::ForwardPhase( + const std::vector& input_embeddings) { + input_loss_.assign(input_loss_.size(), 0.0); + forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + + const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( + galois::iterate(graph_.begin(), graph_.end()), + [&](const unsigned i) { + if (graph_.IsValidForPhase(i, layer_phase_)) { + // do softmax + GNNSoftmax(feature_length, &input_embeddings[feature_length * i], + &forward_output_matrix_[feature_length * i]); + + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + GALOIS_LOG_VERBOSE("Label for LID {} is {}", i, + graph_.GetSingleClassLabel(i)); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + graph_.GetSingleClassLabel(i))] = 1.0; + + // calculate loss for this LID (note not all i will be filled) + input_loss_[i] = + GNNCrossEntropy(feature_length, ground_truth_vec->data(), + &forward_output_matrix_[feature_length * i]); + GALOIS_LOG_VERBOSE("Loss for LID {} is {}", i, input_loss_[i]); + } + }, + // TODO chunk size? 
+ // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("SoftmaxForward")); + + return forward_output_matrix_; +} + +std::vector* +galois::SoftmaxLayer::BackwardPhase(const std::vector&, + std::vector*) { + const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( + galois::iterate(graph_.begin(), graph_.end()), + [&](const unsigned i) { + if (graph_.IsValidForPhase(i, layer_phase_)) { + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + graph_.GetSingleClassLabel(i))] = 1.0; + + // derivative cross entropy into norm grad + std::vector* norm_gradient = + norm_gradient_vectors_.getLocal(); + GNNCrossEntropyDerivative(feature_length, ground_truth_vec->data(), + forward_output_matrix_.data(), + norm_gradient->data()); + + // use norm grad with softmax deritave, save and return + std::vector* softmax_temp = + softmax_temp_vectors_.getLocal(); + GNNSoftmaxDerivative(feature_length, forward_output_matrix_.data(), + norm_gradient->data(), softmax_temp->data(), + backward_output_matrix_.data()); + } + }, + // TODO chunk size? + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("SoftmaxBackward")); + + return &backward_output_matrix_; +} + +// TODO function for getting loss From d7645e2b61f1afd6fdfcc14868dc844aa23c8ce1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:43:52 -0500 Subject: [PATCH 346/660] Softmax layer test Adds a test that runs the forward and backward phase of the softmax layer. Verification only included for the forward pass at this point: it checks to make sure the probability distribution is as expected from the test input. The backward phase is just run without checking its output for sanity purposes. 
--- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/convlayer-test.cpp | 5 +- libgnn/test/softmaxlayer-test.cpp | 107 ++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 libgnn/test/softmaxlayer-test.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9469f86aba..c604dd59e2 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -6,4 +6,8 @@ add_executable(convlayer-test convlayer-test.cpp) target_link_libraries(convlayer-test galois_gnn) add_test(NAME convlayer-test COMMAND convlayer-test) +add_executable(softmaxlayer-test softmaxlayer-test.cpp) +target_link_libraries(softmaxlayer-test galois_gnn) +add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) + # TODO multi host tests diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 2d98875a0d..3c127b0ad0 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -7,9 +7,8 @@ int main() { galois::DistMemSys G; - // size_t num_threads = galois::setActiveThreads( - // 56 / galois::runtime::getSystemNetworkInterface().Num); - size_t num_threads = galois::setActiveThreads(1); + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); GALOIS_LOG_VERBOSE("[{}] Using {} threads", galois::runtime::getSystemNetworkInterface().ID, diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp new file mode 100644 index 0000000000..bd3cd8c5e3 --- /dev/null +++ b/libgnn/test/softmaxlayer-test.cpp @@ -0,0 +1,107 @@ +//! @file convlayer-test.cpp +//! Softmax layer test with a test graph + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SoftmaxLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0{ + .input_rows = 7, + .input_columns = test_graph.GetNumLabelClasses(), + .output_columns = test_graph.GetNumLabelClasses()}; + + GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + + // input to softmax + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[8] = 1; + softmax_input[16] = 1; + softmax_input[24] = 1; + softmax_input[32] = 1; + softmax_input[40] = 1; + softmax_input[48] = 1; + + // train mode + auto output_layer = + std::make_unique(3, test_graph, dimension_0); + const std::vector& prediction_distribution = + output_layer->ForwardPhase(softmax_input); + output_layer->BackwardPhase(softmax_input, nullptr); + + // assert that predictions are as expected + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[i * 7])) == + i); + } + // train mode means last 2 vertices should be empty + for (size_t i = 5; i < 7; i++) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + 
GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + + // validation mode + output_layer->SetLayerPhase(galois::GNNPhase::kValidate); + const std::vector& pd2 = + output_layer->ForwardPhase(softmax_input); + output_layer->BackwardPhase(softmax_input, nullptr); + // validate vertex is index 5 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + for (size_t i = 6; i < 7; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + + // test mode + output_layer->SetLayerPhase(galois::GNNPhase::kTest); + const std::vector& pd3 = + output_layer->ForwardPhase(softmax_input); + output_layer->BackwardPhase(softmax_input, nullptr); + // validate vertex is index 6 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); + // all but last are empty distributions + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(pd3[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); + } + + // TODO in future maybe: add better test for backward phase besides just + // running it +} From b9cc256c7cda6bd59be4b68c48189e68b4afc2a6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 22 Sep 2020 17:17:30 -0500 Subject: [PATCH 347/660] refixed gradient sync, added post process average --- .../include/deepgalois/layers/GradientSyncStructs.h | 6 +++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 13 +++++++++++++ lonestar/gnn/include/engine.h | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 26420aa30d..6f600b40a8 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -15,8 +15,12 @@ struct GradientSync { // if (std::abs(weight - y) > 0.00001) { // galois::gInfo("weight ", node_id, " not consistent with one received"); //} + if (y == 0) { + galois::gPrint("nothing important\n"); + } weight += y; - weight /= 2; + // need a post process divide all step + //weight /= 2; return true; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 9320ade39c..f6741f4b6d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -268,7 +268,20 @@ void graph_conv_layer::back_propagation(const float_t* in_data, math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); drop_timer.stop(); + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = &layer::weight_grad[0]; + unsigned host_num = galois::runtime::getSystemNetworkInterface().Num; layer::syncSub->sync("Gradients"); + galois::do_all( + galois::iterate((size_t)0, 
(size_t)z), + [&] (size_t i) { + //galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); + layer::weight_grad[i] /= host_num; + //galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); + }, + galois::loopname("sync post process") + ); + galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); } diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 016ac80831..f9afb28a4c 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -92,7 +92,7 @@ int main(int argc, char** argv) { // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); - deepgalois::optimizer* opt = new deepgalois::adam(); + deepgalois::optimizer* opt = new deepgalois::adagrad(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); network.train(opt, do_validate); // do training using training samples From 7b71274a02c1834ef95a8b3492fc861a56723a2c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Sep 2020 18:27:44 -0500 Subject: [PATCH 348/660] Add accessors and type to GNNLayers Adds a few more accessors to GNNLayer classes as well as a type field which can be used to determine what kind of layer an object is. Note the separation between an output layer and an intermediate layer; this may be merged later into a single field, but the current design is that an intermediate and output layer are considered differently throughout the codebase. --- libgnn/include/galois/layers/GNNLayer.h | 32 +++++++++++++++++++-- libgnn/include/galois/layers/SoftmaxLayer.h | 3 ++ libgnn/src/GraphConvolutionalLayer.cpp | 1 + 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e5d83678f1..4144dbfead 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -5,6 +5,19 @@ namespace galois { +//! Supported layer types in the GNN +enum class GNNLayerType { + //! Invalid placeholder + kInvalid, + //! GCN + kGraphConvolutional + // TODO SAGE and GAT +}; + +// TODO Sigmoid +//! Supported output layer types in the GNN +enum class GNNOutputLayerType { kInvalid, kSoftmax }; + //! Struct holding the dimensions of a layer. Assumption is that a layer takes //! a matrix and outputs another matrix with a different # of columns (e.g. //! matrix multiply with a set of weights) @@ -50,6 +63,7 @@ class GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } @@ -76,13 +90,20 @@ class GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) = 0; - const std::vector& GetLayerWeightGradients() { + //! Returns the weight gradients + const std::vector& GetLayerWeightGradients() const { return layer_weight_gradients_; } //! Returns dimensions of this layer - // XXX may not be needed - const GNNLayerDimensions& GetLayerDimensions() { return layer_dimensions_; } + const GNNLayerDimensions& GetLayerDimensions() const { + return layer_dimensions_; + } + + galois::GNNLayerType layer_type() const { return layer_type_; } + galois::GNNOutputLayerType output_layer_type() const { + return output_layer_type_; + } protected: //! 
Layer order (starts from 0); used in backward to shortcut output as layer @@ -116,6 +137,11 @@ class GNNLayer { std::vector dropout_mask_; //! Phase of GNN computation that this layer is currently in galois::GNNPhase layer_phase_{galois::GNNPhase::kTrain}; + //! Layer type (invalid if output layer) + galois::GNNLayerType layer_type_{galois::GNNLayerType::kInvalid}; + //! Output layer type (remains invalid if not an output layer) + galois::GNNOutputLayerType output_layer_type_{ + galois::GNNOutputLayerType::kInvalid}; ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 45116f1b62..3052429b8b 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -16,8 +16,11 @@ class SoftmaxLayer : public GNNLayer { ground_truth_vectors_(dimensions.input_columns), norm_gradient_vectors_(dimensions.input_columns), softmax_temp_vectors_(dimensions.input_columns) { + output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + // output needs to match number of possible classes + GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); } //! Creates probability distribution of each row of input diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index fecea27d17..a5abe1d0ef 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -18,6 +18,7 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( layer_dimensions_.input_rows * layer_dimensions_.output_columns; GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); out_temp_.resize(num_output_elements, 0); + layer_type_ = galois::GNNLayerType::kGraphConvolutional; } const std::vector& From b585f2b280a63cce4ff3904f4b07756a077aea62 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Sep 2020 18:42:34 -0500 Subject: [PATCH 349/660] Graph neural network construction + unit test Adds initial implementation of a graph neural network's constructor which can construct an arbitrary GNN given some config object and a few other details. Adds a unit test which checks the structure of the GNN to make sure that it is sane. Also adds a feature length accessor to GNNGraph. Next step is the implementation of forward/backward phases in the GNN. 
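
As a construction sketch (this mirrors the new gnnconstruct-test below;
graph_ptr and num_classes are placeholders, and the last two column sizes
must both equal the number of label classes because softmax keeps input
and output widths identical):

    // two GCN layers followed by a softmax output layer
    std::vector<galois::GNNLayerType> layer_types = {
        galois::GNNLayerType::kGraphConvolutional,
        galois::GNNLayerType::kGraphConvolutional};
    // hidden size, then last-layer/output sizes (= # label classes)
    std::vector<size_t> layer_sizes = {16, num_classes, num_classes};
    galois::GraphNeuralNetworkConfig config(
        2, layer_types, layer_sizes, galois::GNNOutputLayerType::kSoftmax);
    galois::GraphNeuralNetwork gnn(std::move(graph_ptr), std::move(config));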
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GraphNeuralNetwork.h | 128 +++++++++++++++++++++ libgnn/include/galois/graphs/GNNGraph.h | 2 + libgnn/src/GraphNeuralNetwork.cpp | 57 +++++++++ libgnn/test/CMakeLists.txt | 6 +- libgnn/test/gnnconstruct-test.cpp | 64 +++++++++++ 6 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 libgnn/include/galois/GraphNeuralNetwork.h create mode 100644 libgnn/src/GraphNeuralNetwork.cpp create mode 100644 libgnn/test/gnnconstruct-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 24f1f0e33d..ce6e6f990f 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -22,6 +22,7 @@ set(sources src/GNNMath.cpp src/GraphConvolutionalLayer.cpp src/SoftmaxLayer.cpp + src/GraphNeuralNetwork.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h new file mode 100644 index 0000000000..a4eb19f375 --- /dev/null +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -0,0 +1,128 @@ +#pragma once +//! @file GraphNeuralNetwork.h +//! +//! Defines the graph neural network class that is used to classify graphs as +//! well as helper enums/classes involved with the GNN. + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" +#include "galois/layers/GNNLayer.h" + +namespace galois { + +//////////////////////////////////////////////////////////////////////////////// + +//! Configuration object passed into constructor of a GraphNeuralNetwork to +//! determine how the network gets constructed. +class GraphNeuralNetworkConfig { +public: + // default move, no copy + GraphNeuralNetworkConfig() = delete; + GraphNeuralNetworkConfig(const GraphNeuralNetworkConfig&) = delete; + GraphNeuralNetworkConfig& operator=(const GraphNeuralNetworkConfig&) = delete; + GraphNeuralNetworkConfig(GraphNeuralNetworkConfig&&) = default; + GraphNeuralNetworkConfig& operator=(GraphNeuralNetworkConfig&&) = default; + + //! Construction without a config for layers specified; uses a default + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type) + : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, + output_layer_type, + GNNConfig{.do_dropout = true, + .dropout_rate = 0.3, + .do_normalization = true}) {} + + //! Construction with a specified config for layers + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type, + const GNNConfig& default_layer_config) + : num_intermediate_layers_(num_layers), layer_types_(layer_types), + layer_column_sizes_(layer_column_sizes), + output_layer_type_(output_layer_type), + default_layer_config_(default_layer_config) { + // Do sanity checks on inputs + // should have a type for each layer + GALOIS_LOG_ASSERT(num_intermediate_layers_ == layer_types_.size()); + // For now, should be at least 1 intermediate layer + GALOIS_LOG_ASSERT(num_intermediate_layers_ >= 1); + // + 1 because it includes output layer + GALOIS_LOG_ASSERT((num_intermediate_layers_ + 1) == + layer_column_sizes_.size()); + } + + //! # layers NOT including output layer + size_t num_intermediate_layers() { return num_intermediate_layers_; } + //! Get intermediate layer i + GNNLayerType intermediate_layer_type(size_t i) { + assert(i < num_intermediate_layers_); + return layer_types_[i]; + } + //! 
Get intermediate layer i's size + size_t intermediate_layer_size(size_t i) { + assert(i < num_intermediate_layers_); + return layer_column_sizes_[i]; + } + //! Type of output layer + GNNOutputLayerType output_layer_type() { return output_layer_type_; } + //! Size of output layer is last element of layer column sizes + size_t output_layer_size() { + return layer_column_sizes_[num_intermediate_layers_]; + } + //! Get the default layer config of layers in this GNN + const GNNConfig& default_layer_config() { return default_layer_config_; } + +private: + //! Number of layers to construct in the GNN not including the output + //! layer + size_t num_intermediate_layers_; + //! Layers to construct for the GNN going from left to right; size should + //! match num_layers setting + std::vector layer_types_; + //! Size (in columns) of each non-output layer; size should match num_layers + //! + 1 (+1 is for the output layer) + std::vector layer_column_sizes_; + //! Output layer type + GNNOutputLayerType output_layer_type_; + //! Default config to use for layers + GNNConfig default_layer_config_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +//! Class representing the graph neural network: contains the graph to train as +//! well as all the layers that comprise it +class GraphNeuralNetwork { +public: + //! Construct the graph neural network given the graph to train on as well as + //! a configuration object + GraphNeuralNetwork(std::unique_ptr graph, + GraphNeuralNetworkConfig&& config); + + //! Number of intermediate layers (DOES NOT INCLUDE OUTPUT LAYER) + size_t num_intermediate_layers() { return gnn_layers_.size() - 1; } + + //! Returns pointer to intermediate layer i + const galois::GNNLayer* GetIntermediateLayer(size_t i) { + if (i < gnn_layers_.size() - 1) { + return gnn_layers_[i].get(); + } else { + GALOIS_LOG_FATAL("Accessing out of bounds intermediate layer {}", i); + } + } + //! Returns the output layer + const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + +private: + //! Underlying graph to train + std::unique_ptr graph_; + //! Configuration object used to construct this GNN + GraphNeuralNetworkConfig config_; + //! 
GNN layers including the output + std::vector> gnn_layers_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index cefc505992..fa06453df9 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -76,6 +76,8 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + size_t node_feature_length() const { return node_feature_length_; } + const std::vector& GetLocalFeatures() const { return local_node_features_; } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp new file mode 100644 index 0000000000..9b6a4ad708 --- /dev/null +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -0,0 +1,57 @@ +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/SoftmaxLayer.h" + +galois::GraphNeuralNetwork::GraphNeuralNetwork( + std::unique_ptr graph, + galois::GraphNeuralNetworkConfig&& config) + : graph_(std::move(graph)), config_(std::move(config)) { + // max number of rows that can be passed as inputs; allocate space for it as + // this will be the # of rows for each layer + size_t max_rows = graph_->size(); + + // create the intermediate layers + for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { + GNNLayerType layer_type = config_.intermediate_layer_type(i); + size_t prev_layer_columns; + + if (i != 0) { + // grab previous layer's size + prev_layer_columns = config_.intermediate_layer_size(i - 1); + } else { + // first layer means the input columns are # features in graph + prev_layer_columns = graph_->node_feature_length(); + } + + GNNLayerDimensions layer_dims = {.input_rows = max_rows, + .input_columns = prev_layer_columns, + .output_columns = + config_.intermediate_layer_size(i)}; + + switch (layer_type) { + case GNNLayerType::kGraphConvolutional: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } + } + + // create the output layer + GNNLayerDimensions output_dims = { + .input_rows = max_rows, + // get last intermediate layer column size + .input_columns = config_.intermediate_layer_size( + config_.num_intermediate_layers() - 1), + .output_columns = config_.output_layer_size()}; + + switch (config_.output_layer_type()) { + case (GNNOutputLayerType::kSoftmax): + gnn_layers_.push_back(std::move(std::make_unique( + config_.num_intermediate_layers(), *graph_, output_dims))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index c604dd59e2..e7a04b5b5f 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -10,4 +10,8 @@ add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) -# TODO multi host tests +add_executable(gnnconstruct-test gnnconstruct-test.cpp) +target_link_libraries(gnnconstruct-test galois_gnn) +add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) + +# TODO multi host tests? 
diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp new file mode 100644 index 0000000000..9265eb6b8b --- /dev/null +++ b/libgnn/test/gnnconstruct-test.cpp @@ -0,0 +1,64 @@ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +//! @file gnnconstruct-test.cpp +//! Test to make sure construction works as expected + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // 2 layer test with softmax + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + // note this includes the output; last 2 must be same because softmax + std::vector layer_output_sizes = {4, 7, 7}; + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); + galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(gnn_config)); + + // note this does not include output layer + GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); + // assert layer types + GALOIS_LOG_ASSERT(galois::GNNLayerType::kGraphConvolutional == + gnn.GetIntermediateLayer(0)->layer_type()); + GALOIS_LOG_ASSERT(galois::GNNOutputLayerType::kInvalid == + gnn.GetIntermediateLayer(0)->output_layer_type()); + GALOIS_LOG_ASSERT(galois::GNNLayerType::kGraphConvolutional == + gnn.GetIntermediateLayer(1)->layer_type()); + GALOIS_LOG_ASSERT(galois::GNNOutputLayerType::kInvalid == + gnn.GetIntermediateLayer(1)->output_layer_type()); + GALOIS_LOG_ASSERT(galois::GNNLayerType::kInvalid == + gnn.GetOutputLayer()->layer_type()); + GALOIS_LOG_ASSERT(galois::GNNOutputLayerType::kSoftmax == + gnn.GetOutputLayer()->output_layer_type()); + + // assert dimensions are what we expect + const galois::GNNLayerDimensions& layer0_dims = + gnn.GetIntermediateLayer(0)->GetLayerDimensions(); + GALOIS_LOG_ASSERT(layer0_dims.input_rows == 7); + // remember tester has features of length 3 + GALOIS_LOG_ASSERT(layer0_dims.input_columns == 3); + GALOIS_LOG_ASSERT(layer0_dims.output_columns == 4); + + const galois::GNNLayerDimensions& layer1_dims = + gnn.GetIntermediateLayer(1)->GetLayerDimensions(); + GALOIS_LOG_ASSERT(layer1_dims.input_rows == 7); + GALOIS_LOG_ASSERT(layer1_dims.input_columns == 4); + GALOIS_LOG_ASSERT(layer1_dims.output_columns == 7); + + const galois::GNNLayerDimensions& output_dims = + gnn.GetOutputLayer()->GetLayerDimensions(); + GALOIS_LOG_ASSERT(output_dims.input_rows == 7); + GALOIS_LOG_ASSERT(output_dims.input_columns == 7); + GALOIS_LOG_ASSERT(output_dims.output_columns == 7); +} From 53b97018f78ea6acc3f005cfdaac1f49feb11a29 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 1 Oct 2020 18:40:53 -0500 Subject: [PATCH 350/660] Forward phase (inference) of GNN Added initial forward phase/inference call to the gnn class. Added various accessors and setters to layers to facilitate checking the results of a forward pass (e.g. getting reference to output). Test added to make sure the forward pass works as expected. 
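A minimal usage sketch of the new inference call (assuming a GraphNeuralNetwork object gnn built as in the tests; this fragment is illustrative only and not part of the patch):

    // Forward phase only: push the graph's node features through every layer
    // and read back per-layer outputs for verification.
    gnn->SetAllLayerWeightsTo1();  // debug helper so outputs are deterministic
    const std::vector<galois::GNNFloat>* predictions = gnn->DoInference();
    // predictions points at the output layer's forward output; intermediate
    // outputs are reachable via GetIntermediateLayer(i)->GetForwardOutput().

The gnnfb-test added below checks exactly these per-layer outputs against hand-computed values.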
--- libgnn/include/galois/GraphNeuralNetwork.h | 21 +++ libgnn/include/galois/layers/GNNLayer.h | 43 +++--- libgnn/src/GraphNeuralNetwork.cpp | 9 ++ libgnn/test/CMakeLists.txt | 4 + libgnn/test/gnnconstruct-test.cpp | 5 +- libgnn/test/gnnfb-test.cpp | 156 +++++++++++++++++++++ 6 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 libgnn/test/gnnfb-test.cpp diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index a4eb19f375..8762612a9b 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -113,9 +113,30 @@ class GraphNeuralNetwork { GALOIS_LOG_FATAL("Accessing out of bounds intermediate layer {}", i); } } + + //! Set the phases of all layers at once + void SetLayerPhases(galois::GNNPhase phase) { + for (std::unique_ptr& ptr : gnn_layers_) { + ptr->SetLayerPhase(phase); + } + } + + //! Set weights on all layers to 1; should be used for debugging only + void SetAllLayerWeightsTo1() { + for (std::unique_ptr& ptr : gnn_layers_) { + ptr->InitAllWeightsTo1(); + } + } + //! Returns the output layer const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + //! Propogates the graph's feature vectors through the network to get a new + //! vector representation. + //! Also known as the forward phase in most literature + //! @returns Output layer's output + const std::vector* DoInference(); + private: //! Underlying graph to train std::unique_ptr graph_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 4144dbfead..f22507b6be 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -69,7 +69,33 @@ class GNNLayer { //! Initializes all layer weights to 1. This is used as a debug function for //! testing. - void InitAllWeightsTo1() { layer_weights_.assign(layer_weights_.size(), 1); } + void InitAllWeightsTo1() { + if (layer_weights_.size()) { + layer_weights_.assign(layer_weights_.size(), 1); + } + } + + const std::vector& GetForwardOutput() const { + return forward_output_matrix_; + } + const std::vector& GetBackwardOutput() const { + return backward_output_matrix_; + } + + //! Returns the weight gradients + const std::vector& GetLayerWeightGradients() const { + return layer_weight_gradients_; + } + + //! Returns dimensions of this layer + const GNNLayerDimensions& GetLayerDimensions() const { + return layer_dimensions_; + } + + galois::GNNLayerType layer_type() const { return layer_type_; } + galois::GNNOutputLayerType output_layer_type() const { + return output_layer_type_; + } //! Conducts the forward phase given the input to this layer which //! ultimately leads to an output (classfication of node labels) at the end @@ -90,21 +116,6 @@ class GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) = 0; - //! Returns the weight gradients - const std::vector& GetLayerWeightGradients() const { - return layer_weight_gradients_; - } - - //! Returns dimensions of this layer - const GNNLayerDimensions& GetLayerDimensions() const { - return layer_dimensions_; - } - - galois::GNNLayerType layer_type() const { return layer_type_; } - galois::GNNOutputLayerType output_layer_type() const { - return output_layer_type_; - } - protected: //! Layer order (starts from 0); used in backward to shortcut output as layer //! 
0 does not need to do some things that other layers need to do diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 9b6a4ad708..a593a218bf 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -55,3 +55,12 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( GALOIS_LOG_FATAL("Invalid layer type during network construction"); } } + +const std::vector* galois::GraphNeuralNetwork::DoInference() { + // start with graph features and pass it through all layers of the network + const std::vector* layer_input = &(graph_->GetLocalFeatures()); + for (std::unique_ptr& ptr : gnn_layers_) { + layer_input = &(ptr->ForwardPhase(*layer_input)); + } + return layer_input; +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index e7a04b5b5f..66c70c6f26 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -14,4 +14,8 @@ add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) +add_executable(gnnfb-test gnnfb-test.cpp) +target_link_libraries(gnnfb-test galois_gnn) +add_test(NAME gnnfb-test COMMAND gnnfb-test) + # TODO multi host tests? diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 9265eb6b8b..537a16d5b0 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -1,8 +1,9 @@ -#include "galois/Logging.h" -#include "galois/GraphNeuralNetwork.h" //! @file gnnconstruct-test.cpp //! Test to make sure construction works as expected +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + int main() { galois::DistMemSys G; diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp new file mode 100644 index 0000000000..7a9ee5d697 --- /dev/null +++ b/libgnn/test/gnnfb-test.cpp @@ -0,0 +1,156 @@ +//! @file gnnfb-test.cpp +//! Runs a forward and backward phase on a GCN and an example graph. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // 2 layer test with softmax + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + // note this includes the output; last 2 must be same because softmax + std::vector layer_output_sizes = {4, 7, 7}; + // note GNNConfig is passed in; use a config that does not do anything extra + // like dropout or activation and the like so that input is easier to verify + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig()); + auto gnn = std::make_unique( + std::move(test_graph), std::move(gnn_config)); + // for constancy set everything to 1 + gnn->SetAllLayerWeightsTo1(); + + ////////////////////////////////////////////////////////////////////////////// + // forward phase + ////////////////////////////////////////////////////////////////////////////// + gnn->DoInference(); + + // check output for layers to make sure it's as expected + const std::vector& lf0_out = + gnn->GetIntermediateLayer(0)->GetForwardOutput(); + GALOIS_LOG_ASSERT(lf0_out.size() == 28); + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[0 + i] == 3); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[4 + i] == 6); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[8 + i] == 12); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[12 + i] == 18); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[16 + i] == 24); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[20 + i] == 30); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[24 + i] == 15); + } + + const std::vector& lf1_out = + gnn->GetIntermediateLayer(1)->GetForwardOutput(); + GALOIS_LOG_ASSERT(lf1_out.size() == 49); + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[0 + i] == 24); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[7 + i] == 60); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[14 + i] == 96); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[21 + i] == 144); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[28 + i] == 192); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[35 + i] == 156); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); + } + + const std::vector& fo_out = + gnn->GetOutputLayer()->GetForwardOutput(); + GALOIS_LOG_ASSERT(fo_out.size() == 49); + // since row all same, prob distribution across row should be same + for (size_t c = 0; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_VERBOSE("{}", fo_out[c + i]); + GALOIS_LOG_ASSERT(fo_out[c + i] == fo_out[c + i + 1]); + } + } + + // train mode = last 2 should be masked off + for (size_t c = 35; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out[c + i] == 0); + } + } + + ////////////////////////////////////////////////////////////////////////////// + // TODO backward phase + 
////////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////////// + // verify forward val and test masks + ////////////////////////////////////////////////////////////////////////////// + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + gnn->DoInference(); + const std::vector& fo_out_val = + gnn->GetOutputLayer()->GetForwardOutput(); + for (size_t c = 0; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_val[c + i] == fo_out_val[c + i + 1]); + } + } + // first 5 and last should be 0s + for (size_t c = 0; c < 35; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); + } + } + for (size_t c = 42; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); + } + } + + // all but last should be 0s + gnn->SetLayerPhases(galois::GNNPhase::kTest); + gnn->DoInference(); + const std::vector& fo_out_test = + gnn->GetOutputLayer()->GetForwardOutput(); + for (size_t c = 0; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_test[c + i] == fo_out_test[c + i + 1]); + } + } + // first 5 and last should be 0s + for (size_t c = 0; c < 42; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); + } + } + + // TODO different config of gnn +} From d1aff528137624c6e81d51e5eb2b5424262ae989 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 1 Oct 2020 19:38:49 -0500 Subject: [PATCH 351/660] GNN backward phase; no weight update Added implementation for GNN backward phase without the weight update. Each layer passes its gradients to the previous layer in the GNN where weight gradients as well as gradients for the next layer in propagation are determined. The weight gradients are not yet used to update the weights via SGD. This requires the addition of optimizer classes and will be the next step in this refactoring. Adds a call to the backward phase in the GNN FB test to make sure no crashes occur. --- libgnn/include/galois/GraphNeuralNetwork.h | 6 +++++ libgnn/src/GraphNeuralNetwork.cpp | 30 ++++++++++++++++++++++ libgnn/test/gnnfb-test.cpp | 20 +++++++++++++-- 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 8762612a9b..962350c8c4 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -32,6 +32,7 @@ class GraphNeuralNetworkConfig { output_layer_type, GNNConfig{.do_dropout = true, .dropout_rate = 0.3, + .do_activation = true, .do_normalization = true}) {} //! Construction with a specified config for layers @@ -137,6 +138,11 @@ class GraphNeuralNetwork { //! @returns Output layer's output const std::vector* DoInference(); + //! Backpropagate gradients from the output layer backwards through the + //! network to update the layer weights. Also known as a backward phase in + //! most literature + void GradientPropagation(); + private: //! 
Underlying graph to train std::unique_ptr graph_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index a593a218bf..966cd3238a 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -64,3 +64,33 @@ const std::vector* galois::GraphNeuralNetwork::DoInference() { } return layer_input; } + +void galois::GraphNeuralNetwork::GradientPropagation() { + // from output layer get initial gradients + std::vector dummy; + std::unique_ptr& output_layer = gnn_layers_.back(); + std::vector* current_gradients = + output_layer->BackwardPhase(dummy, nullptr); + + // loops through intermediate layers in a backward fashion + // -1 to ignore output layer which was handled above + for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { + // note this assumes you have at least 2 layers + size_t layer_index = gnn_layers_.size() - 2 - i; + + // get the input to the layer before this one + const std::vector* prev_layer_input; + if (layer_index != 0) { + prev_layer_input = &(gnn_layers_[layer_index - 1]->GetForwardOutput()); + } else { + prev_layer_input = &(graph_->GetLocalFeatures()); + } + + // backward prop and get a new set of gradients + current_gradients = gnn_layers_[layer_index]->BackwardPhase( + *prev_layer_input, current_gradients); + // at this point in the layer the gradients exist; use the gradients to + // update the weights of the layer + // XXX need optimizers + } +} diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 7a9ee5d697..9fd43b4675 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -108,9 +108,12 @@ int main() { } ////////////////////////////////////////////////////////////////////////////// - // TODO backward phase + // backward phase; run it; verifying is difficult due to floating point + // nature of softmax gradients ////////////////////////////////////////////////////////////////////////////// + gnn->GradientPropagation(); + ////////////////////////////////////////////////////////////////////////////// // verify forward val and test masks ////////////////////////////////////////////////////////////////////////////// @@ -151,6 +154,19 @@ int main() { GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); } } + ////////////////////////////////////////////////////////////////////////////// + // run different config of gnn with dropout/activation + ////////////////////////////////////////////////////////////////////////////// - // TODO different config of gnn + GALOIS_LOG_VERBOSE("Running with different congifuration"); + + test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::GraphNeuralNetworkConfig gnn_config2( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); + auto gnn2 = std::make_unique( + std::move(test_graph), std::move(gnn_config2)); + // run to make sure no crashes occur + gnn2->DoInference(); + gnn2->GradientPropagation(); } From 27d807d336f803d0bc29b6e90dda46776ab0015b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Oct 2020 14:22:33 -0500 Subject: [PATCH 352/660] Adam optimizer and test Added the Adam optimizer and a test for it. Notably different from the previous implementation is that there is now separate training variables for each layer instead of having them shared among all layers. 
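The per-weight rule the new optimizer applies is the standard Adam update with bias correction; a minimal single-weight sketch (not part of this patch) follows. beta1_t and beta2_t hold the running powers beta1^t and beta2^t, starting at beta1 and beta2 as in the beta power vectors of the new class.

    #include <cmath>

    // One Adam step for a single weight w with gradient g. m and v are the first
    // and second moment estimates carried across calls (initialized to 0).
    float AdamStep(float w, float g, float& m, float& v,
                   float& beta1_t, float& beta2_t,
                   float alpha, float beta1, float beta2, float epsilon) {
      m = beta1 * m + (1.0f - beta1) * g;        // first moment estimate
      v = beta2 * v + (1.0f - beta2) * g * g;    // second (uncentered) moment
      float m_hat = m / (1.0f - beta1_t);        // bias-corrected moments
      float v_hat = v / (1.0f - beta2_t);
      beta1_t *= beta1;                          // advance powers for the next step
      beta2_t *= beta2;
      return w - alpha * m_hat / (std::sqrt(v_hat) + epsilon);
    }

With the configuration used by the adam-test below (alpha = 1, beta1 = beta2 = 0.5, epsilon = 0) and w = g = 1, this gives m = v = 0.5, bias-corrected moments of 1, and a step of exactly 1, which is why the test expects the weights to drop from 1 to 0 on the first call.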
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNOptimizers.h | 70 +++++++++++++++++++++++++++ libgnn/src/GNNOptimizers.cpp | 44 +++++++++++++++++ libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/adam-test.cpp | 44 +++++++++++++++++ 5 files changed, 163 insertions(+) create mode 100644 libgnn/include/galois/GNNOptimizers.h create mode 100644 libgnn/src/GNNOptimizers.cpp create mode 100644 libgnn/test/adam-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ce6e6f990f..e6c8786cd2 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -23,6 +23,7 @@ set(sources src/GraphConvolutionalLayer.cpp src/SoftmaxLayer.cpp src/GraphNeuralNetwork.cpp + src/GNNOptimizers.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h new file mode 100644 index 0000000000..8037cbdef0 --- /dev/null +++ b/libgnn/include/galois/GNNOptimizers.h @@ -0,0 +1,70 @@ +#pragma once +// Code inspired from this; actual code style is not the same + changed some +// things such as adding params for every layer which TinyDNN does not seem to +// do +// https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h +// Copyright (c) 2013, Taiga Nomi and the respective contributors +// All rights reserved. +// Changed by Galois under 3-BSD +#include "galois/GNNTypes.h" +#include + +namespace galois { + +//! Virtual class; optimizers all need the descent function +class BaseOptimizer { + virtual void GradientDescent(const std::vector& derivatives, + std::vector* matrix, + size_t layer_number) = 0; +}; + +//! Maintains a first and second moment for each weight in the weight matrix and +//! does gradient descent invidiually on each weight +class AdamOptimizer : public BaseOptimizer { +public: + //! Struct for specifying adam config. Defaults based on the Adam paper. + struct AdamConfiguration { + GNNFloat alpha{0.001}; + GNNFloat beta1{0.9}; + GNNFloat beta2{0.999}; + GNNFloat epsilon{1e-8}; + }; + + //! Constructor allocates memory, initializes training vars for each layer + AdamOptimizer(const AdamConfiguration& config, + const std::vector& trainable_layer_sizes, + size_t num_trainable_layers) + : config_(config), num_trainable_layers_(num_trainable_layers), + beta1_power_t_(num_trainable_layers_, config.beta1), + beta2_power_t_(num_trainable_layers_, config.beta2) { + assert(trainable_layer_sizes.size() == num_trainable_layers_); + // allocate vectors based on # of trainable layers + for (size_t layer_size : trainable_layer_sizes) { + first_moments_.emplace_back(layer_size, 0.0); + second_moments_.emplace_back(layer_size, 0.0); + } + assert(first_moments_.size() == num_trainable_layers_); + assert(second_moments_.size() == num_trainable_layers_); + } + //! Adam based gradient descent + void GradientDescent(const std::vector& derivatives, + std::vector* matrix, + size_t layer_number) final; + +private: + //! Configuration options for this layer + AdamConfiguration config_; + //! First moment vectors; one for each trainable layer + std::vector> first_moments_; + //! Second moment vectors; one for each trainable layer + std::vector> second_moments_; + //! 
Number of layers that can be trained (need moment vectors for each) + size_t num_trainable_layers_; + // power terms used in adam: updated by raising power every time update is + // called + // vector because one is necessary for each layer + std::vector beta1_power_t_; + std::vector beta2_power_t_; +}; + +} // namespace galois diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp new file mode 100644 index 0000000000..8698aa37c3 --- /dev/null +++ b/libgnn/src/GNNOptimizers.cpp @@ -0,0 +1,44 @@ +#include "galois/Galois.h" +#include "galois/GNNOptimizers.h" +#include "galois/Logging.h" +#include + +void galois::AdamOptimizer::GradientDescent( + const std::vector& derivatives, std::vector* matrix, + size_t layer_number) { + assert(derivatives.size() == matrix->size()); + + // grab based on layer being used + std::vector& first_moment = first_moments_[layer_number]; + std::vector& second_moment = second_moments_[layer_number]; + assert(derivatives.size() == first_moment.size()); + assert(derivatives.size() == second_moment.size()); + + // individual weight updates via gradients + galois::do_all( + galois::iterate(static_cast(0), matrix->size()), + [&](size_t i) { + // moment estimate updates + first_moment[i] = config_.beta1 * first_moment[i] + + (1.0 - config_.beta1) * derivatives[i]; + second_moment[i] = + config_.beta2 * second_moment[i] + + (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); + GALOIS_LOG_VERBOSE("{} {}", first_moment[i], second_moment[i]); + // bias corrected moments using beta power + GNNFloat bias_correct_first = + first_moment[i] / (1.0 - beta1_power_t_[layer_number]); + GNNFloat bias_correct_second = + second_moment[i] / (1.0 - beta2_power_t_[layer_number]); + GALOIS_LOG_VERBOSE("{} {}", bias_correct_first, bias_correct_second); + // weight update using bias corrected moments + (matrix->data())[i] -= + config_.alpha * bias_correct_first / + (std::sqrt(bias_correct_second) + config_.epsilon); + }, + galois::loopname("AdamOptimizerGradientDescent")); + + // update the power terms for next update call + beta1_power_t_[layer_number] *= config_.beta1; + beta2_power_t_[layer_number] *= config_.beta2; +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 66c70c6f26..029f785ad1 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -18,4 +18,8 @@ add_executable(gnnfb-test gnnfb-test.cpp) target_link_libraries(gnnfb-test galois_gnn) add_test(NAME gnnfb-test COMMAND gnnfb-test) +add_executable(adam-test adam-test.cpp) +target_link_libraries(adam-test galois_gnn) +add_test(NAME adam-test COMMAND adam-test) + # TODO multi host tests? diff --git a/libgnn/test/adam-test.cpp b/libgnn/test/adam-test.cpp new file mode 100644 index 0000000000..e01368ce87 --- /dev/null +++ b/libgnn/test/adam-test.cpp @@ -0,0 +1,44 @@ +//! @file adam-test.cpp +//! 
Tests the adam optimizer +#include "galois/DistGalois.h" +#include "galois/GNNOptimizers.h" +#include "galois/Logging.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + // create sample config that is easy to trace + galois::AdamOptimizer::AdamConfiguration config = { + .alpha = 1, .beta1 = 0.5, .beta2 = 0.5, .epsilon = 0}; + std::vector layer_sizes = {2, 1}; + galois::AdamOptimizer adam(config, layer_sizes, 2); + + std::vector weights1 = {1, 1}; + std::vector weights2 = {10}; + std::vector grad1 = {1, 1}; + std::vector grad2 = {10}; + + adam.GradientDescent(grad1, &weights1, 0); + // check weights + GALOIS_LOG_ASSERT(weights1[0] == 0.0); + GALOIS_LOG_ASSERT(weights1[1] == 0.0); + + adam.GradientDescent(grad2, &weights2, 1); + GALOIS_LOG_ASSERT(weights2[0] == 9.0); + + // run again to check if adam keeps moments from before + adam.GradientDescent(grad1, &weights1, 0); + // check weights again (turns out derivative one ends up doing same thing) + GALOIS_LOG_ASSERT(weights1[0] == -1.0); + GALOIS_LOG_ASSERT(weights1[1] == -1.0); + + // grad 2 again + adam.GradientDescent(grad2, &weights2, 1); + GALOIS_LOG_ASSERT(weights2[0] == 8.0); +} From 34c9848df23614e7bc24e6ab31a99b8a05007d63 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Oct 2020 18:24:38 -0500 Subject: [PATCH 353/660] GNN now takes an optimizer object Prep for gradient descent in GNN backward phase by adding an argument to GNN constructor requiring that an optimizer be specified. Changed tests to have optimizers as well. --- libgnn/include/galois/GNNOptimizers.h | 15 +++++++++++---- libgnn/include/galois/GraphNeuralNetwork.h | 4 ++++ libgnn/src/GraphNeuralNetwork.cpp | 4 +++- libgnn/test/gnnconstruct-test.cpp | 5 ++++- libgnn/test/gnnfb-test.cpp | 10 ++++++---- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 8037cbdef0..84531e5a20 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -8,6 +8,7 @@ // Changed by Galois under 3-BSD #include "galois/GNNTypes.h" #include +#include namespace galois { @@ -30,6 +31,11 @@ class AdamOptimizer : public BaseOptimizer { GNNFloat epsilon{1e-8}; }; + AdamOptimizer(const std::vector& trainable_layer_sizes, + size_t num_trainable_layers) + : AdamOptimizer(AdamConfiguration(), trainable_layer_sizes, + num_trainable_layers) {} + //! 
Constructor allocates memory, initializes training vars for each layer AdamOptimizer(const AdamConfiguration& config, const std::vector& trainable_layer_sizes, @@ -37,11 +43,12 @@ class AdamOptimizer : public BaseOptimizer { : config_(config), num_trainable_layers_(num_trainable_layers), beta1_power_t_(num_trainable_layers_, config.beta1), beta2_power_t_(num_trainable_layers_, config.beta2) { - assert(trainable_layer_sizes.size() == num_trainable_layers_); + // >= because only prefix will be considered otherwise + assert(trainable_layer_sizes.size() >= num_trainable_layers_); // allocate vectors based on # of trainable layers - for (size_t layer_size : trainable_layer_sizes) { - first_moments_.emplace_back(layer_size, 0.0); - second_moments_.emplace_back(layer_size, 0.0); + for (size_t i = 0; i < num_trainable_layers_; i++) { + first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); } assert(first_moments_.size() == num_trainable_layers_); assert(second_moments_.size() == num_trainable_layers_); diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 962350c8c4..d9cd6febc9 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -5,6 +5,7 @@ //! well as helper enums/classes involved with the GNN. #include "galois/Logging.h" +#include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" @@ -101,6 +102,7 @@ class GraphNeuralNetwork { //! Construct the graph neural network given the graph to train on as well as //! a configuration object GraphNeuralNetwork(std::unique_ptr graph, + std::unique_ptr optimizer, GraphNeuralNetworkConfig&& config); //! Number of intermediate layers (DOES NOT INCLUDE OUTPUT LAYER) @@ -146,6 +148,8 @@ class GraphNeuralNetwork { private: //! Underlying graph to train std::unique_ptr graph_; + //! Optimizer object for weight updates + std::unique_ptr optimizer_; //! Configuration object used to construct this GNN GraphNeuralNetworkConfig config_; //! 
GNN layers including the output diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 966cd3238a..daaa49297f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -4,8 +4,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, + std::unique_ptr optimizer, galois::GraphNeuralNetworkConfig&& config) - : graph_(std::move(graph)), config_(std::move(config)) { + : graph_(std::move(graph)), optimizer_(std::move(optimizer)), + config_(std::move(config)) { // max number of rows that can be passed as inputs; allocate space for it as // this will be the # of rows for each layer size_t max_rows = graph_->size(); diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 537a16d5b0..25abf0e4a1 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -25,7 +25,10 @@ int main() { std::vector layer_output_sizes = {4, 7, 7}; galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(gnn_config)); + auto adam = std::make_unique(layer_output_sizes, 2); + + galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(adam), + std::move(gnn_config)); // note this does not include output layer GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 9fd43b4675..75b91f40b6 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -28,8 +28,9 @@ int main() { galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, galois::GNNConfig()); - auto gnn = std::make_unique( - std::move(test_graph), std::move(gnn_config)); + auto adam = std::make_unique(layer_output_sizes, 2); + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 gnn->SetAllLayerWeightsTo1(); @@ -164,8 +165,9 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - auto gnn2 = std::make_unique( - std::move(test_graph), std::move(gnn_config2)); + auto adam2 = std::make_unique(layer_output_sizes, 2); + auto gnn2 = std::make_unique( + std::move(test_graph), std::move(adam2), std::move(gnn_config2)); // run to make sure no crashes occur gnn2->DoInference(); gnn2->GradientPropagation(); From b0ecc157238f4dbad80de62902f19091c79a3660 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Oct 2020 18:57:15 -0500 Subject: [PATCH 354/660] Added gradient descent to backward pass GNN's backward pass now also calls the optimizer to update weights based on the gradient, thus completing the pipeline for a full training epoch. The tests have been updated accordingly to fix the adam size (supposed to be size of layer, not output). Next step is to add accuracy measures and then a full program can be tested end to end. 
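With gradient descent attached to the backward pass, one training epoch reduces to an inference call followed by gradient propagation. A minimal sketch (assuming a constructed gnn as in the tests and a caller-chosen num_epochs; not part of this patch):

    // One training epoch = forward pass + backward pass with Adam weight updates.
    // Accuracy reporting is added in a later commit.
    gnn->SetLayerPhases(galois::GNNPhase::kTrain);
    for (size_t epoch = 0; epoch < num_epochs; epoch++) {
      gnn->DoInference();          // forward phase over all layers
      gnn->GradientPropagation();  // backward phase + OptimizeLayer on each layer
    }
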
--- libgnn/include/galois/GNNOptimizers.h | 1 + libgnn/include/galois/layers/GNNLayer.h | 6 +++++- libgnn/src/GNNLayer.cpp | 6 ++++++ libgnn/src/GraphNeuralNetwork.cpp | 3 ++- libgnn/test/gnnconstruct-test.cpp | 3 ++- libgnn/test/gnnfb-test.cpp | 12 ++++++++++-- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 84531e5a20..a970c54c56 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -14,6 +14,7 @@ namespace galois { //! Virtual class; optimizers all need the descent function class BaseOptimizer { +public: virtual void GradientDescent(const std::vector& derivatives, std::vector* matrix, size_t layer_number) = 0; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index f22507b6be..3647434be6 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -1,6 +1,7 @@ #pragma once #include "galois/PerThreadRNG.h" +#include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" namespace galois { @@ -116,6 +117,10 @@ class GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) = 0; + //! Given an optimizer, update the weights in this layer based on gradients + //! stored in the layer + void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + protected: //! Layer order (starts from 0); used in backward to shortcut output as layer //! 0 does not need to do some things that other layers need to do @@ -170,7 +175,6 @@ class GNNLayer { //! matrix void Activation(); //! Calculate derivative of activation function based on config on the matrix - // XXX void ActivationDerivative(std::vector* matrix); }; diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index d14a5d1b05..33114a2f06 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -100,3 +100,9 @@ void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { }, galois::loopname("ReLU-Derivative")); } + +void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, + size_t trainable_layer_number) { + optimizer->GradientDescent(layer_weight_gradients_, &layer_weights_, + trainable_layer_number); +} diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index daaa49297f..18675d2ce4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -91,8 +91,9 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // backward prop and get a new set of gradients current_gradients = gnn_layers_[layer_index]->BackwardPhase( *prev_layer_input, current_gradients); + // if not output do optimization/gradient descent // at this point in the layer the gradients exist; use the gradients to // update the weights of the layer - // XXX need optimizers + gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); } } diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 25abf0e4a1..69c64105f6 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -25,7 +25,8 @@ int main() { std::vector layer_output_sizes = {4, 7, 7}; galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - auto adam = std::make_unique(layer_output_sizes, 2); + std::vector adam_sizes = {12, 28}; + auto adam = std::make_unique(adam_sizes, 2); galois::GraphNeuralNetwork 
gnn(std::move(test_graph), std::move(adam), std::move(gnn_config)); diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 75b91f40b6..8142b2435b 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -28,7 +28,10 @@ int main() { galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, galois::GNNConfig()); - auto adam = std::make_unique(layer_output_sizes, 2); + // input is 7 x 3, layers are then 3 x 4 and 4 x 7 and 7 x 7 + // middle 2 are trainable so 12 and 28 + std::vector adam_sizes = {12, 28}; + auto adam = std::make_unique(adam_sizes, 2); auto gnn = std::make_unique( std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 @@ -119,6 +122,7 @@ int main() { // verify forward val and test masks ////////////////////////////////////////////////////////////////////////////// gnn->SetLayerPhases(galois::GNNPhase::kValidate); + gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); const std::vector& fo_out_val = gnn->GetOutputLayer()->GetForwardOutput(); @@ -138,9 +142,11 @@ int main() { GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); } } + gnn->GradientPropagation(); // all but last should be 0s gnn->SetLayerPhases(galois::GNNPhase::kTest); + gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); const std::vector& fo_out_test = gnn->GetOutputLayer()->GetForwardOutput(); @@ -155,6 +161,8 @@ int main() { GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); } } + gnn->GradientPropagation(); + ////////////////////////////////////////////////////////////////////////////// // run different config of gnn with dropout/activation ////////////////////////////////////////////////////////////////////////////// @@ -165,7 +173,7 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - auto adam2 = std::make_unique(layer_output_sizes, 2); + auto adam2 = std::make_unique(adam_sizes, 2); auto gnn2 = std::make_unique( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); // run to make sure no crashes occur From b07f243ff0e1d322d8cabbe9a04132275220b3cd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 6 Oct 2020 13:24:49 -0500 Subject: [PATCH 355/660] Global accuracy in GNN; test for it as well Added a function to the GNN class to get accuracy of a prediction distribution from a softmax layer based on training, validiation, and test modes in the GNN. Added a field to track the mode in the GNN in addition to it being tracked in the layers. Added a test to make sure accuracy readings were returning correctly as expected as well. --- libgnn/include/galois/GraphNeuralNetwork.h | 11 ++- libgnn/include/galois/graphs/GNNGraph.h | 9 +++ libgnn/include/galois/layers/SoftmaxLayer.h | 1 - libgnn/src/GraphNeuralNetwork.cpp | 41 ++++++++++ libgnn/test/CMakeLists.txt | 4 + libgnn/test/accuracy-test.cpp | 89 +++++++++++++++++++++ 6 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 libgnn/test/accuracy-test.cpp diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index d9cd6febc9..80f7b07916 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -117,8 +117,9 @@ class GraphNeuralNetwork { } } - //! Set the phases of all layers at once + //! 
Set the phases of all layers at once as well as this network void SetLayerPhases(galois::GNNPhase phase) { + phase_ = phase; for (std::unique_ptr& ptr : gnn_layers_) { ptr->SetLayerPhase(phase); } @@ -140,6 +141,8 @@ class GraphNeuralNetwork { //! @returns Output layer's output const std::vector* DoInference(); + float GetGlobalAccuracy(const std::vector& predictions); + //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! most literature @@ -154,6 +157,12 @@ class GraphNeuralNetwork { GraphNeuralNetworkConfig config_; //! GNN layers including the output std::vector> gnn_layers_; + //! Current phase of the GNN: train, validation, test + GNNPhase phase_{GNNPhase::kTrain}; + //! Used to track accurate predictions during accuracy calculation + DGAccumulator num_correct_; + //! Used to count total number of things checked during accuracy calculation + DGAccumulator total_checked_; }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index fa06453df9..79d96d0da5 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -49,6 +49,15 @@ class GNNGraph { NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } //! Return GID of some local node size_t GetGID(unsigned lid) const { return partitioned_graph_->getGID(lid); } + + NodeIterator begin_owned() const { + return partitioned_graph_->masterNodesRange().begin(); + } + + NodeIterator end_owned() const { + return partitioned_graph_->masterNodesRange().end(); + } + //! Given an LID and the current phase of GNN computation, determine if the //! lid in question is valid for the current phase (i.e., it is part of //! a training, validation, or test phase mask) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 3052429b8b..3b5ace94c8 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -33,7 +33,6 @@ class SoftmaxLayer : public GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) final; - // TODO prediction loss function? private: //! 
Loss for each row of the input std::vector input_loss_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 18675d2ce4..7c209a3cbf 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -1,3 +1,4 @@ +#include "galois/GNNMath.h" #include "galois/GraphNeuralNetwork.h" #include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/SoftmaxLayer.h" @@ -67,6 +68,46 @@ const std::vector* galois::GraphNeuralNetwork::DoInference() { return layer_input; } +float galois::GraphNeuralNetwork::GetGlobalAccuracy( + const std::vector& predictions) { + // check owned nodes' accuracy + size_t num_labels = graph_->GetNumLabelClasses(); + assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + galois::iterate(graph_->begin_owned(), graph_->end_owned()), + [&](const unsigned lid) { + if (graph_->IsValidForPhase(lid, phase_)) { + total_checked_ += 1; + // get prediction by getting max + size_t predicted_label = + galois::MaxIndex(num_labels, &(predictions[lid * num_labels])); + // GALOIS_LOG_VERBOSE("Checking LID {} with label {} against + // prediction {}", + // lid, graph_->GetSingleClassLabel(lid), + // predicted_label); + // check against ground truth and track accordingly + // TODO static cast used here is dangerous + if (predicted_label == + static_cast(graph_->GetSingleClassLabel(lid))) { + num_correct_ += 1; + } + } + }, + // TODO chunk size? + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + // TODO revise for later when multi-class labels come in + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + return static_cast(global_correct) / + static_cast(global_checked); +} + void galois::GraphNeuralNetwork::GradientPropagation() { // from output layer get initial gradients std::vector dummy; diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 029f785ad1..5934ad6331 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -22,4 +22,8 @@ add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) add_test(NAME adam-test COMMAND adam-test) +add_executable(accuracy-test accuracy-test.cpp) +target_link_libraries(accuracy-test galois_gnn) +add_test(NAME accuracy-test COMMAND accuracy-test) + # TODO multi host tests? diff --git a/libgnn/test/accuracy-test.cpp b/libgnn/test/accuracy-test.cpp new file mode 100644 index 0000000000..61d449255f --- /dev/null +++ b/libgnn/test/accuracy-test.cpp @@ -0,0 +1,89 @@ +//! @file accuracy-test.cpp +//! Similar to softmax test except that accuracy is checked + it constructs +//! a full network object. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = {7, 7}; + galois::GraphNeuralNetworkConfig gnn_config( + 1, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig()); + + std::vector adam_sizes = {21}; + auto adam = std::make_unique(adam_sizes, 1); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + // for constancy set everything to 1 + gnn->SetAllLayerWeightsTo1(); + + ////////////////////////////////////////////////////////////////////////////// + + const std::vector* distributions = gnn->DoInference(); + // accuracy will be 0.2: everything chooses the first 1 as the entire row + // is the same + float pred_accuracy = gnn->GetGlobalAccuracy(*distributions); + GALOIS_LOG_VERBOSE("{}", pred_accuracy); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.2)); + + // validation mode + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + const std::vector* dist2 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(*dist2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); + + // test mode + gnn->SetLayerPhases(galois::GNNPhase::kTest); + const std::vector* dist3 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(*dist3); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); + + // manufactured predictions to make sure it predicts things correctly based + // on mode + // prediction is correct if diagonal of the 7x7 matrix has largest value + std::vector mpred = { + 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + + gnn->SetLayerPhases(galois::GNNPhase::kTrain); + pred_accuracy = gnn->GetGlobalAccuracy(mpred); + GALOIS_LOG_VERBOSE("{}", pred_accuracy); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.8)); + + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + pred_accuracy = gnn->GetGlobalAccuracy(mpred); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); + + gnn->SetLayerPhases(galois::GNNPhase::kTest); + pred_accuracy = gnn->GetGlobalAccuracy(mpred); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); + + std::vector mpred2 = { + 0.5, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0.1, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 1, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0.1}; + pred_accuracy = gnn->GetGlobalAccuracy(mpred2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); + + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + pred_accuracy = gnn->GetGlobalAccuracy(mpred2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); + + gnn->SetLayerPhases(galois::GNNPhase::kTest); + pred_accuracy = gnn->GetGlobalAccuracy(mpred2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); +} From 3a735e4c03c9ab651f5b109a0179935932618073 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Oct 2020 16:09:51 -0500 Subject: [PATCH 356/660] Various libgnn cleanup/bugfixes - Initialize different seeds for each thread in per thread RNG. 
- Fix dropout sizes being used (there's still a problem with it however). - Added more assertions for safety purposes throughout code. - Changed default value in Adam optimizer. - Removed some VERBOSE prints to clean things up. And various other things I may have forgotten. Accidentally included CMakeList modification in this commit for epoch-test; will be added in next commit. --- libgnn/include/galois/GNNOptimizers.h | 2 +- libgnn/include/galois/PerThreadRNG.h | 12 ++++++++++-- libgnn/src/GNNLayer.cpp | 21 +++++++++++++-------- libgnn/src/GNNMath.cpp | 4 +--- libgnn/src/GNNOptimizers.cpp | 2 -- libgnn/src/GraphConvolutionalLayer.cpp | 14 +++++++++----- libgnn/src/GraphNeuralNetwork.cpp | 4 +++- libgnn/src/SoftmaxLayer.cpp | 20 +++++++++++--------- libgnn/test/CMakeLists.txt | 4 ++++ 9 files changed, 52 insertions(+), 31 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index a970c54c56..c0e8dd2582 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -26,7 +26,7 @@ class AdamOptimizer : public BaseOptimizer { public: //! Struct for specifying adam config. Defaults based on the Adam paper. struct AdamConfiguration { - GNNFloat alpha{0.001}; + GNNFloat alpha{0.01}; GNNFloat beta1{0.9}; GNNFloat beta2{0.999}; GNNFloat epsilon{1e-8}; diff --git a/libgnn/include/galois/PerThreadRNG.h b/libgnn/include/galois/PerThreadRNG.h index 80f8d11f0a..fde88386ab 100644 --- a/libgnn/include/galois/PerThreadRNG.h +++ b/libgnn/include/galois/PerThreadRNG.h @@ -1,7 +1,9 @@ #pragma once #include #include "galois/substrate/PerThreadStorage.h" +#include "galois/Galois.h" #include "galois/GNNTypes.h" +#include "galois/Logging.h" namespace galois { @@ -9,9 +11,15 @@ namespace galois { class PerThreadRNG { public: //! Default seed 0, default distribution 0 to 1 - PerThreadRNG() : distribution_{0.0, 1.0} {}; + PerThreadRNG() : PerThreadRNG(0.0, 1.0){}; //! User specified range - PerThreadRNG(float begin, float end) : distribution_{begin, end} {}; + PerThreadRNG(float begin, float end) : distribution_{begin, end} { + // each thread needs to have a different seed so that the same # isn't + // chosen across all threads + galois::on_each([&](unsigned tid, unsigned n_threads) { + engine_.getLocal()->seed(tid * n_threads); + }); + }; //! 
Returns a random number between numbers specified during init GNNFloat GetRandomNumber() { return (*distribution_.getLocal())(*engine_.getLocal()); diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 33114a2f06..396f7ddf7c 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -1,3 +1,4 @@ +#include "galois/Logging.h" #include "galois/layers/GNNLayer.h" galois::GNNLayer::GNNLayer(size_t layer_num, @@ -9,8 +10,8 @@ galois::GNNLayer::GNNLayer(size_t layer_num, if (config_.allocate_weights) { // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input - dropout_mask_.resize(layer_dimensions_.input_rows * - layer_dimensions_.input_columns); + dropout_mask_.resize( + layer_dimensions_.input_rows * layer_dimensions_.input_columns, false); // allocate memory based on layer dimensions size_t num_weight_elements = layer_dimensions_.input_columns * layer_dimensions_.output_columns; @@ -37,22 +38,25 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { galois::loopname("RandomInitVector")); } +// XXX Something is wrong with dropout; accuracy suffers, figure out what +// it is void galois::GNNLayer::DoDropout(std::vector* output_matrix) { - // XXX fix droptout, should use inputs not weights - size_t num_weights = layer_weights_.size(); + size_t num_elements = output_matrix->size(); + assert(num_elements == dropout_mask_.size()); + // determine which weights to drop galois::do_all( - galois::iterate(static_cast(0), num_weights), + galois::iterate(static_cast(0), num_elements), [&](size_t i) { dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); }, galois::loopname("LayerDropoutRNG")); // create new matrix with non-dropped weights + some scaling - // TODO scaling? + // TODO save scaling elsewhere? GNNFloat scale = 1. / (1. - config_.dropout_rate); galois::do_all( - galois::iterate(static_cast(0), num_weights), + galois::iterate(static_cast(0), num_elements), [&](size_t i) { (*output_matrix)[i] = layer_weights_[i] * static_cast(dropout_mask_[i]) * scale; @@ -61,7 +65,9 @@ void galois::GNNLayer::DoDropout(std::vector* output_matrix) { } void galois::GNNLayer::DoDropoutDerivative() { + assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. 
- config_.dropout_rate); + // use dropout mask to figure out derivative galois::do_all( galois::iterate(static_cast(0), backward_output_matrix_.size()), @@ -88,7 +94,6 @@ void galois::GNNLayer::Activation() { void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { // TODO only does relu at the moment; should check user specified activation // and act accordingly - // XXX // keep gradient if the original output is greater than 0 galois::do_all( galois::iterate(static_cast(0), gradient->size()), diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 303e872e2a..0087c7340b 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -84,8 +84,6 @@ galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, continue; } - GALOIS_LOG_VERBOSE("Truth {} input {}", ground_truth[i], input[i]); - if (input[i] == 0.0) { loss -= ground_truth[i] * std::log(static_cast(1e-10)); } else { @@ -101,7 +99,7 @@ void galois::GNNCrossEntropyDerivative(const size_t vector_length, const GNNFloat* input, GNNFloat* gradients) { for (size_t i = 0; i < vector_length; i++) { - gradients[i] = -(ground_truth[i]) / (input[i] + 1e-10); + gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); } } diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 8698aa37c3..53088825fd 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -24,13 +24,11 @@ void galois::AdamOptimizer::GradientDescent( second_moment[i] = config_.beta2 * second_moment[i] + (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); - GALOIS_LOG_VERBOSE("{} {}", first_moment[i], second_moment[i]); // bias corrected moments using beta power GNNFloat bias_correct_first = first_moment[i] / (1.0 - beta1_power_t_[layer_number]); GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2_power_t_[layer_number]); - GALOIS_LOG_VERBOSE("{} {}", bias_correct_first, bias_correct_second); // weight update using bias corrected moments (matrix->data())[i] -= config_.alpha * bias_correct_first / diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index a5abe1d0ef..7b513374db 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -34,17 +34,14 @@ galois::GraphConvolutionalLayer::ForwardPhase( const GNNFloat* input_data = input_embeddings.data(); // first, dropout // TODO only dropout if in training apparently - if (config_.do_dropout) { - GALOIS_LOG_VERBOSE("Doing dropout"); + if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(&in_temp_1_); input_data = in_temp_1_.data(); } - GALOIS_LOG_VERBOSE("Doing aggregate"); // aggregation and update (or vice versa) AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), &input_column_intermediates_); - GALOIS_LOG_VERBOSE("Doing embedding update"); // TODO synchronization of aggregation functions UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); @@ -64,6 +61,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( std::vector* galois::GraphConvolutionalLayer::BackwardPhase( const std::vector& prev_layer_input, std::vector* input_gradient) { + assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation if (config_.do_activation) { ActivationDerivative(input_gradient); @@ -73,6 +71,10 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( // TODO do optimized cased like the forward if (layer_number_ != 0) { // transposed sgemm for derivative; in_temp is output + 
assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + assert(in_temp_1_.size() == + layer_dimensions_.input_columns * layer_dimensions_.input_rows); UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), @@ -87,7 +89,7 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( input_gradient->data(), layer_weight_gradients_.data()); // TODO sync weights - if (config_.do_dropout) { + if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); } @@ -155,6 +157,8 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddings( void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { + assert(layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7c209a3cbf..3424c2b3e3 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -104,6 +104,8 @@ float galois::GraphNeuralNetwork::GetGlobalAccuracy( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); + GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); + return static_cast(global_correct) / static_cast(global_checked); } @@ -118,7 +120,7 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // loops through intermediate layers in a backward fashion // -1 to ignore output layer which was handled above for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { - // note this assumes you have at least 2 layers + // note this assumes you have at least 2 layers (including output) size_t layer_index = gnn_layers_.size() - 2 - i; // get the input to the layer before this one diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/SoftmaxLayer.cpp index 1c7073e560..30dc476965 100644 --- a/libgnn/src/SoftmaxLayer.cpp +++ b/libgnn/src/SoftmaxLayer.cpp @@ -6,8 +6,8 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( const std::vector& input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); - const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { @@ -21,8 +21,6 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( ground_truth_vectors_.getLocal(); assert(ground_truth_vec->size() == feature_length); ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - GALOIS_LOG_VERBOSE("Label for LID {} is {}", i, - graph_.GetSingleClassLabel(i)); // single class label is an index; set the correct one (*ground_truth_vec)[static_cast( graph_.GetSingleClassLabel(i))] = 1.0; @@ -31,7 +29,6 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), &forward_output_matrix_[feature_length * i]); - GALOIS_LOG_VERBOSE("Loss for LID {} is {}", i, input_loss_[i]); } }, // TODO chunk size? 
@@ -45,11 +42,14 @@ std::vector* galois::SoftmaxLayer::BackwardPhase(const std::vector&, std::vector*) { const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { // create ground truth vector for this LID + // TODO maybe make this part of the graph class instead of recreating + // every time std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); assert(ground_truth_vec->size() == feature_length); @@ -61,16 +61,18 @@ galois::SoftmaxLayer::BackwardPhase(const std::vector&, // derivative cross entropy into norm grad std::vector* norm_gradient = norm_gradient_vectors_.getLocal(); - GNNCrossEntropyDerivative(feature_length, ground_truth_vec->data(), - forward_output_matrix_.data(), - norm_gradient->data()); + GNNCrossEntropyDerivative( + feature_length, ground_truth_vec->data(), + &(forward_output_matrix_[i * feature_length]), + norm_gradient->data()); // use norm grad with softmax deritave, save and return std::vector* softmax_temp = softmax_temp_vectors_.getLocal(); - GNNSoftmaxDerivative(feature_length, forward_output_matrix_.data(), + GNNSoftmaxDerivative(feature_length, + &(forward_output_matrix_[i * feature_length]), norm_gradient->data(), softmax_temp->data(), - backward_output_matrix_.data()); + &(backward_output_matrix_[i * feature_length])); } }, // TODO chunk size? diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 5934ad6331..7ad7bf1888 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -26,4 +26,8 @@ add_executable(accuracy-test accuracy-test.cpp) target_link_libraries(accuracy-test galois_gnn) add_test(NAME accuracy-test COMMAND accuracy-test) +add_executable(epoch-test epoch-test.cpp) +target_link_libraries(epoch-test galois_gnn) +add_test(NAME epoch-test COMMAND epoch-test) + # TODO multi host tests? From 803f62378144b9b978ec3da429a0395416ac3013 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Oct 2020 16:10:30 -0500 Subject: [PATCH 357/660] epoch-test Test that runs on the cora dataset for 100 epochs to see if accuracy grows as time passes. The test has exposed some issues with the dropout option that will need to be fixed in a future commit, but otherwise training accuracy seems to grow as time passes and the program seems to scale. --- libgnn/test/epoch-test.cpp | 52 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 libgnn/test/epoch-test.cpp diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp new file mode 100644 index 0000000000..aada47eea2 --- /dev/null +++ b/libgnn/test/epoch-test.cpp @@ -0,0 +1,52 @@ +//! @file epoch-test.cpp +//! Run 50 epochs of training to see if results improve. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + // size_t num_threads = galois::setActiveThreads(1); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load graph + auto test_graph = std::make_unique( + "cora", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = { + 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + // XXX fix dropout accuracy + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig{.do_dropout = false, .do_normalization = true}); + + std::vector adam_sizes = {16 * test_graph->node_feature_length(), + 16 * test_graph->GetNumLabelClasses()}; + auto adam = std::make_unique(adam_sizes, 2); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + + ////////////////////////////////////////////////////////////////////////////// + + // no verification; test should be eyeballed to make sure accuracy is + // increasing + for (size_t epoch = 0; epoch < 100; epoch++) { + const std::vector* predictions = gnn->DoInference(); + gnn->GradientPropagation(); + galois::gPrint("Epoch ", epoch, ": Accuracy is ", + gnn->GetGlobalAccuracy(*predictions), "\n"); + } + + // check test accuracy + gnn->SetLayerPhases(galois::GNNPhase::kTest); + const std::vector* predictions = gnn->DoInference(); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(*predictions), + "\n"); +} From 0a01df9789dbc87dc3aeae12d4b07e008798212e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 13:06:49 -0500 Subject: [PATCH 358/660] Removing some gDebugs Removing some gDebugs that appear while debugging GNNs in NewGeneric and GraphHelepers --- libcusp/include/galois/graphs/NewGeneric.h | 13 ------------ .../include/galois/graphs/GraphHelpers.h | 20 ------------------- 2 files changed, 33 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 3af95db9dd..4632d3b4d8 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -560,16 +560,10 @@ class NewDistGraphGeneric : public DistGraph { lid++; } } - galois::gDebug("[", base_DistGraph::id, " -> ", h, "] bitset size ", - (end - start) / 64, " vs. vector size ", - syncNodes[h].size() / 2); } lid -= numLocal; assert(lid == numToReserve); - galois::gDebug("[", base_DistGraph::id, "] total bitset size ", - (ghosts.size() - numLocal) / 64, " vs. 
total vector size ", - numToReserve / 2); // TODO: should not be used after this - refactor to make this clean ghosts.resize(0); @@ -1450,13 +1444,6 @@ class NewDistGraphGeneric : public DistGraph { asyncSyncLoad(nodeLoads, nodeAccum, edgeLoads, edgeAccum, loadsClear); } loadSyncTimer.stop(); - -#ifndef NDEBUG - if (async) { - galois::gDebug("[", base_DistGraph::id, "] host count ", - hostFinished.count()); - } -#endif } // if asynchronous, don't move on until everything is done diff --git a/libgalois/include/galois/graphs/GraphHelpers.h b/libgalois/include/galois/graphs/GraphHelpers.h index e7da20ebc1..ab0b48c5a5 100644 --- a/libgalois/include/galois/graphs/GraphHelpers.h +++ b/libgalois/include/galois/graphs/GraphHelpers.h @@ -167,8 +167,6 @@ auto divideNodesBinarySearch( // weight of a block (one block for each division by default; if scale // factor specifies something different, then use that instead) uint64_t blockWeight = (weight + numBlocks - 1) / numBlocks; - // galois::gDebug("weight ", weight, " numblock ", numBlocks, " blockwegith ", - // blockWeight); // lower and upper blocks that this division should use determined // using scaleFactor @@ -182,9 +180,6 @@ auto divideNodesBinarySearch( uint32_t blockUpper = scaleFactor[id]; assert(blockLower <= blockUpper); - // galois::gDebug("Unit ", id, " block ", blockLower, " to ", - // blockUpper, "; ", blockLower * blockWeight, " ", - // blockUpper * blockWeight); uint64_t nodesLower; // use prefix sum to find node bounds @@ -215,10 +210,6 @@ auto divideNodesBinarySearch( edgesUpper = edgePrefixSum[nodesUpper - 1 + nodeOffset] - edgeOffset; } - // galois::gDebug("Unit ", id, " nodes ", nodesLower, " to ", - // nodesUpper, " edges ", edgesLower, " ", - // edgesUpper); - return GraphRange( NodeRange(iterator(nodesLower), iterator(nodesUpper)), EdgeRange(edge_iterator(edgesLower), edge_iterator(edgesUpper))); @@ -294,11 +285,6 @@ void determineUnitRangesLoopGraph(GraphTy& graph, uint32_t unitsToSplit, // unit assinged no nodes, copy last one returnRanges[i + 1] = returnRanges[i]; } - - galois::gDebug("LoopGraph Unit ", i, " gets nodes ", returnRanges[i], - " to ", returnRanges[i + 1], ", num edges is ", - graph.edge_end(returnRanges[i + 1] - 1) - - graph.edge_begin(returnRanges[i])); } } @@ -362,9 +348,6 @@ void determineUnitRangesLoopPrefixSum(VectorTy& prefixSum, // unit assinged no nodes returnRanges[i + 1] = returnRanges[i]; } - - galois::gDebug("Unit ", i, " gets nodes ", returnRanges[i], " to ", - returnRanges[i + 1]); } } @@ -522,9 +505,6 @@ std::vector determineUnitRangesFromPrefixSum(uint32_t unitsToSplit, // unit assinged no nodes nodeRanges[i + 1] = nodeRanges[i]; } - - galois::gDebug("Unit ", i, " gets nodes ", nodeRanges[i], " to ", - nodeRanges[i + 1]); } return nodeRanges; From 03007ddce3f25da74420a661b5f7f24927d32fda Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 18:57:48 -0500 Subject: [PATCH 359/660] Glorot Bengio layer weight initialization Added a new function to initialize GNN layer weights based on a paper by Glorot and Bengio at AISTATS 2010. This was what was used in the non-refactored code to great effect in terms of accuracy gain. 
--- libgnn/include/galois/layers/GNNLayer.h | 8 ++++++++ libgnn/src/GNNLayer.cpp | 15 +++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 3647434be6..37d32a3c4d 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -161,6 +161,14 @@ class GNNLayer { ////////////////////////////////////////////////////////////////////////////// + //! Init based from following paper + //! http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf + //! Since it is unclear what j and j+1 refer to in that paper, the things + //! used are the dimensions of this particular weight matrix + //! TODO revisit paper and see what they really mean + //! Code inspired DGL and TinyDNN + void GlorotBengioInit(std::vector* vector_to_init); + //! Randomly init a float vector using the class's random init RNG void RandomInitVector(std::vector* vector_to_init); diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 396f7ddf7c..5a8f106f20 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -17,8 +17,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_dimensions_.input_columns * layer_dimensions_.output_columns; layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); - // init weights randomly with a parallel loop - RandomInitVector(&layer_weights_); + GlorotBengioInit(&layer_weights_); } size_t num_output_elements = @@ -28,6 +27,18 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); } +void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { + float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + + layer_dimensions_.input_columns); + // TODO this seed should be configurable + std::default_random_engine rng(1); + std::uniform_real_distribution dist(-max, max); + + for (size_t i = 0; i < vector_to_init->size(); i++) { + (*vector_to_init)[i] = dist(rng); + } +} + void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { galois::do_all( galois::iterate(static_cast(0), vector_to_init->size()), From 39c50716fcb0872988f6b81f6ba4d559a1f83c7e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 19:42:38 -0500 Subject: [PATCH 360/660] GCN layer aggregate/update flip based on dims Added an "optimization" to GCN passes where if input columns are greater than output columns then update occurs before aggregation to make it so aggregation has less work to do. This comes from DGL which also does something similar to save on compute time. 
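Note: the reasoning behind the flip, stated as a rough cost model, is that aggregation touches every edge once per feature column while the dense update is a |V| x input_columns x output_columns multiply in either order, so it is cheaper to aggregate over whichever side is narrower. A tiny sketch of the decision rule follows; it is illustrative only, and the patch implements the check inline in the layer.

#include <cstddef>

// Aggregate-then-update work is roughly |E| * in_cols + |V| * in_cols * out_cols;
// update-then-aggregate work is roughly |V| * in_cols * out_cols + |E| * out_cols.
// The dense multiply term is shared, so aggregate first only when the input
// side is the narrower one.
bool AggregateBeforeUpdate(size_t in_cols, size_t out_cols) {
  return in_cols <= out_cols;
}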
--- libgnn/src/GraphConvolutionalLayer.cpp | 71 ++++++++++++++++++-------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 7b513374db..c2a838e0fd 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -33,17 +33,25 @@ galois::GraphConvolutionalLayer::ForwardPhase( // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); // first, dropout - // TODO only dropout if in training apparently if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(&in_temp_1_); input_data = in_temp_1_.data(); } - // aggregation and update (or vice versa) - AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), - &input_column_intermediates_); + // flip aggregate/update if dimensions favor it (do less work) + if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + // aggregation and update + AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), + &input_column_intermediates_); + UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); + } else { + // update to aggregate + UpdateEmbeddings(input_data, out_temp_.data()); + AggregateAll(layer_dimensions_.output_columns, out_temp_.data(), + forward_output_matrix_.data(), &output_column_intermediates_); + } + // TODO synchronization of aggregation functions - UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); // TODO if input columns > output columns do update first then aggregate for // efficiency @@ -68,25 +76,46 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( } // derivative of aggregation/update - // TODO do optimized cased like the forward - if (layer_number_ != 0) { - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(in_temp_1_.size() == - layer_dimensions_.input_columns * layer_dimensions_.input_rows); - UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), - backward_output_matrix_.data(), &input_column_intermediates_); + // TODO clean up logic here to reduce nesting + if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + assert(in_temp_1_.size() == + layer_dimensions_.input_columns * layer_dimensions_.input_rows); + UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), + backward_output_matrix_.data(), + &input_column_intermediates_); + } + // weight gradient calculation + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), input_gradient->data(), + layer_weight_gradients_.data()); + } else { + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), + out_temp_.data(), 
&output_column_intermediates_); + if (layer_number_ != 0) { + // derivative for update + UpdateEmbeddingsDerivative(out_temp_.data(), + backward_output_matrix_.data()); + } + // weight gradient; note the use of the aggregated gradient in out_temp + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), out_temp_.data(), + layer_weight_gradients_.data()); } + // TODO sync agg/update - // weight gradient calculation - galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, - layer_dimensions_.output_columns, prev_layer_input.data(), - input_gradient->data(), layer_weight_gradients_.data()); // TODO sync weights if (config_.do_dropout && layer_number_ != 0) { From ce4437e75fbe622894f6c6d95c41a41d5ceac08f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 19:45:17 -0500 Subject: [PATCH 361/660] Timer to epoch-test Adds a timer to epoch test to compare with older code. Note that after the previous few commits the accuracy now matches quite closely to the older code (there are slight differences due to corrections to the optimizer), and it is also faster than the older code as well. --- libgnn/test/epoch-test.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index aada47eea2..0bf8c61f81 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -22,9 +22,12 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; // XXX fix dropout accuracy + // XXX fix activation too galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.do_dropout = false, .do_normalization = true}); + galois::GNNConfig{.do_dropout = false, + .do_activation = false, + .do_normalization = true}); std::vector adam_sizes = {16 * test_graph->node_feature_length(), 16 * test_graph->GetNumLabelClasses()}; auto adam = std::make_unique(adam_sizes, 2); @@ -37,6 +40,8 @@ int main() { // no verification; test should be eyeballed to make sure accuracy is // increasing + galois::StatTimer main_timer("Timer_0"); + main_timer.start(); for (size_t epoch = 0; epoch < 100; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", @@ -49,4 +54,5 @@ int main() { const std::vector* predictions = gnn->DoInference(); galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(*predictions), "\n"); + main_timer.stop(); } From db5204ce20388ec92a7b0695d938016f749f851d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 9 Oct 2020 18:37:49 -0500 Subject: [PATCH 362/660] Fixed forward dropout: input to drop not weights Fixed the dropout occurring in the forward phase of the GCN layer: the original implementation was dropping the layer weights instead of the input it is supposed to drop, which completely wrecked accuracy and caused segfaults. Turning on dropout no longer makes accuracy horrible.
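Note: the corrected behavior is standard inverted dropout applied to the layer input: zero a random subset of input entries and scale the survivors by 1 / (1 - rate) so the expected value is unchanged at inference time. A minimal sketch follows, assuming a 0/1 keep mask has already been sampled; the function and parameter names are illustrative, not the patch's exact code.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Inverted dropout on the input matrix rather than on the weight matrix; the
// weight matrix generally has a different size than the input, which is
// consistent with the segfaults mentioned above.
void ApplyDropout(const std::vector<float>& input,
                  const std::vector<uint8_t>& keep_mask, float dropout_rate,
                  std::vector<float>* output) {
  assert(input.size() == keep_mask.size());
  assert(input.size() == output->size());
  const float scale = 1.0f / (1.0f - dropout_rate);
  for (size_t i = 0; i < input.size(); i++) {
    (*output)[i] = input[i] * static_cast<float>(keep_mask[i]) * scale;
  }
}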
--- libgnn/include/galois/layers/GNNLayer.h | 3 ++- libgnn/src/GNNLayer.cpp | 12 +++++++----- libgnn/src/GraphConvolutionalLayer.cpp | 2 +- libgnn/test/epoch-test.cpp | 6 +++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 37d32a3c4d..0d30c337f2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -175,7 +175,8 @@ class GNNLayer { //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate - void DoDropout(std::vector* output_matrix); + void DoDropout(const std::vector& input_to_drop, + std::vector* output_matrix); //! Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 5a8f106f20..bcefd42efe 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -51,11 +51,13 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { // XXX Something is wrong with dropout; accuracy suffers, figure out what // it is -void galois::GNNLayer::DoDropout(std::vector* output_matrix) { +void galois::GNNLayer::DoDropout(const std::vector& input_to_dropout, + std::vector* output_matrix) { size_t num_elements = output_matrix->size(); assert(num_elements == dropout_mask_.size()); + assert(num_elements == input_to_dropout.size()); - // determine which weights to drop + // determine which parts to drop galois::do_all( galois::iterate(static_cast(0), num_elements), [&](size_t i) { @@ -63,14 +65,14 @@ void galois::GNNLayer::DoDropout(std::vector* output_matrix) { }, galois::loopname("LayerDropoutRNG")); - // create new matrix with non-dropped weights + some scaling + // create new matrix with non-dropped input + some scaling // TODO save scaling elsewhere? GNNFloat scale = 1. / (1. 
- config_.dropout_rate); galois::do_all( galois::iterate(static_cast(0), num_elements), [&](size_t i) { - (*output_matrix)[i] = - layer_weights_[i] * static_cast(dropout_mask_[i]) * scale; + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * scale; }, galois::loopname("LayerDropout")); } diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index c2a838e0fd..92f554da45 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -34,7 +34,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( const GNNFloat* input_data = input_embeddings.data(); // first, dropout if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(&in_temp_1_); + DoDropout(input_embeddings, &in_temp_1_); input_data = in_temp_1_.data(); } diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 0bf8c61f81..c6c98ab7d1 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,7 +14,7 @@ int main() { // load graph auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kOEC, true); + "reddit", galois::graphs::GNNPartitionScheme::kOEC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -25,7 +25,7 @@ int main() { // XXX fix activation too galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.do_dropout = false, + galois::GNNConfig{.do_dropout = true, .do_activation = false, .do_normalization = true}); @@ -42,7 +42,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 100; epoch++) { + for (size_t epoch = 0; epoch < 5; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", From 675018698fd3d66a1ea99c10c97060b23edba3be Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 9 Oct 2020 18:59:53 -0500 Subject: [PATCH 363/660] Disable convlayer-test and gnnfb-test for now With the addition of the optimization that flips aggregation/update order based on input/output columns, some old tests are now broken because the answer differs due to the flip in order. Disabled tests for now until I figure out the new correct output or undo the optimization somehow for the tests. 
--- libgnn/test/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 7ad7bf1888..791b79757e 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,9 +2,9 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) -add_executable(convlayer-test convlayer-test.cpp) -target_link_libraries(convlayer-test galois_gnn) -add_test(NAME convlayer-test COMMAND convlayer-test) +#add_executable(convlayer-test convlayer-test.cpp) +#target_link_libraries(convlayer-test galois_gnn) +#add_test(NAME convlayer-test COMMAND convlayer-test) add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) @@ -14,9 +14,9 @@ add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) -add_executable(gnnfb-test gnnfb-test.cpp) -target_link_libraries(gnnfb-test galois_gnn) -add_test(NAME gnnfb-test COMMAND gnnfb-test) +#add_executable(gnnfb-test gnnfb-test.cpp) +#target_link_libraries(gnnfb-test galois_gnn) +#add_test(NAME gnnfb-test COMMAND gnnfb-test) add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) From a7db2e2c885e125ad3bc68bbc22202e89ba9aef7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 12:30:00 -0500 Subject: [PATCH 364/660] Option to disable agg/update flip, reenable tests Adds an option to layers so the program doesn't automatically flip aggregate/update to avoid more work in the aggregate step. This allows the tests that broke before to be re-enabled. Also fixed the gnnfb test by removing the backward-propagation calls in the validation and test phases (backward propagation should only run during training). --- libgnn/include/galois/layers/GNNLayer.h | 3 +++ libgnn/src/GraphConvolutionalLayer.cpp | 6 ++++-- libgnn/test/CMakeLists.txt | 12 ++++++------ libgnn/test/convlayer-test.cpp | 16 ++++++++++------ libgnn/test/gnnfb-test.cpp | 7 +++---- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 0d30c337f2..ac6cc9dd0e 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -44,6 +44,9 @@ struct GNNConfig { bool do_activation{false}; //! True if normalization is to occur during multiplies bool do_normalization{false}; + //! If this is true, aggregate may occur after multiply if # of input columns + //!
is higher than output columns to do less work in aggregation + bool allow_aggregate_after_update{true}; // TODO activation type; for now default is softmax }; diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 92f554da45..5a222ced62 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -39,7 +39,8 @@ galois::GraphConvolutionalLayer::ForwardPhase( } // flip aggregate/update if dimensions favor it (do less work) - if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.allow_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), &input_column_intermediates_); @@ -77,7 +78,8 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( // derivative of aggregation/update // TODO clean up logic here to reduce nesting - if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.allow_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { if (layer_number_ != 0) { // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() == diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 791b79757e..7ad7bf1888 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,9 +2,9 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) -#add_executable(convlayer-test convlayer-test.cpp) -#target_link_libraries(convlayer-test galois_gnn) -#add_test(NAME convlayer-test COMMAND convlayer-test) +add_executable(convlayer-test convlayer-test.cpp) +target_link_libraries(convlayer-test galois_gnn) +add_test(NAME convlayer-test COMMAND convlayer-test) add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) @@ -14,9 +14,9 @@ add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) -#add_executable(gnnfb-test gnnfb-test.cpp) -#target_link_libraries(gnnfb-test galois_gnn) -#add_test(NAME gnnfb-test COMMAND gnnfb-test) +add_executable(gnnfb-test gnnfb-test.cpp) +target_link_libraries(gnnfb-test galois_gnn) +add_test(NAME gnnfb-test COMMAND gnnfb-test) add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 3c127b0ad0..3f46fb84d2 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -50,8 +50,9 @@ int main() { // create the layer, no norm factor // note layer number is 1 so that it does something in backward phase std::unique_ptr layer_0 = - std::make_unique(0, test_graph, - dimension_0); + std::make_unique( + 0, test_graph, dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const std::vector& layer_0_forward_output = @@ -133,8 +134,9 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, - dimension_0); + std::make_unique( + 1, test_graph, dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); layer_1->InitAllWeightsTo1(); 
const std::vector& layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -199,8 +201,10 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - galois::GNNConfig config = { - .do_dropout = true, .do_activation = true, .do_normalization = true}; + galois::GNNConfig config = {.do_dropout = true, + .do_activation = true, + .do_normalization = true, + .allow_aggregate_after_update = false}; // finally, just make sure dropout and activation run without crashes // (verification requires floating point accuracy or setting a seed which I diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 8142b2435b..50c40ff2c1 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -27,7 +27,7 @@ int main() { // like dropout or activation and the like so that input is easier to verify galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig()); + galois::GNNConfig{.allow_aggregate_after_update = false}); // input is 7 x 3, layers are then 3 x 4 and 4 x 7 and 7 x 7 // middle 2 are trainable so 12 and 28 std::vector adam_sizes = {12, 28}; @@ -142,7 +142,6 @@ int main() { GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); } } - gnn->GradientPropagation(); // all but last should be 0s gnn->SetLayerPhases(galois::GNNPhase::kTest); @@ -161,7 +160,6 @@ int main() { GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); } } - gnn->GradientPropagation(); ////////////////////////////////////////////////////////////////////////////// // run different config of gnn with dropout/activation @@ -172,7 +170,8 @@ int main() { test_graph = std::make_unique( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( - 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig{.allow_aggregate_after_update = false}); auto adam2 = std::make_unique(adam_sizes, 2); auto gnn2 = std::make_unique( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); From d83a224008a3996327062cbd072c93ef01034b0d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 12:32:04 -0500 Subject: [PATCH 365/660] old gnn: add tester as a valid input, use adam opt Two small changes to make it easier to debug the new refactored code. 
--- libdeepgalois/include/deepgalois/configs.h | 5 +++-- lonestar/gnn/include/engine.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h index f21dff7fed..5cbb1909fd 100644 --- a/libdeepgalois/include/deepgalois/configs.h +++ b/libdeepgalois/include/deepgalois/configs.h @@ -5,8 +5,9 @@ namespace deepgalois { const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset -#define NUM_DATASETS 8 +#define NUM_DATASETS 9 const std::string dataset_names[NUM_DATASETS] = { - "cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; + "cora", "citeseer", "ppi", "pubmed", "flickr", + "yelp", "reddit", "amazon", "tester"}; } // namespace deepgalois diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index f9afb28a4c..016ac80831 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -92,7 +92,7 @@ int main(int argc, char** argv) { // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); - deepgalois::optimizer* opt = new deepgalois::adagrad(); + deepgalois::optimizer* opt = new deepgalois::adam(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); network.train(opt, do_validate); // do training using training samples From 06142dc7f83134fe67aaaaec95e28b608c857bc4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 13:47:13 -0500 Subject: [PATCH 366/660] Switch epoch test to cora At this point in time turning on activation makes things much worse than the older code; will have to figure out why this is the case, but this is lower priority at the moment because activation also slows convergence. --- libgnn/test/epoch-test.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index c6c98ab7d1..c37a7d2e34 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,15 +14,14 @@ int main() { // load graph auto test_graph = std::make_unique( - "reddit", galois::graphs::GNNPartitionScheme::kOEC, true); + "cora", galois::graphs::GNNPartitionScheme::kOEC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, galois::GNNLayerType::kGraphConvolutional}; std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; - // XXX fix dropout accuracy - // XXX fix activation too + // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, galois::GNNConfig{.do_dropout = true, @@ -42,7 +41,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 5; epoch++) { + for (size_t epoch = 0; epoch < 100; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", From 600e0ef5831a4f6a3182f4c6d7d2f182079cab76 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 15:31:15 -0500 Subject: [PATCH 367/660] libgnn CMakeLists cleanup Removal of unused MKL links as well as openmp/pthreads which don't seem to be required by openblas. 
--- libgnn/CMakeLists.txt | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index e6c8786cd2..88398c3d60 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,21 +1,11 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") -SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) -SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) -set(BLAS_LIB "-lopenblas -lpthread") -if(USE_MKL_BLAS) - link_directories(${INTEL_LIBS_DIR}) - message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") - SET(BLAS_INC_DIR ${MKL_ROOT}/include) - SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) - set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") - #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") -endif() - +set(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +set(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) # blas library include_directories(${BLAS_INC_DIR}) link_directories(${BLAS_LIB_DIR}) +set(BLAS_LIB "-lopenblas") + set(sources src/GNNGraph.cpp src/GNNLayer.cpp From bd84e8f3981107ad89161f260530f678d919f8ff Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 16:39:47 -0500 Subject: [PATCH 368/660] Removed some galois/cusp/gluon debugs Removed a bunch of gDebugs that were making output in Debug build extremely hard to parse. --- libcusp/include/galois/graphs/BasePolicies.h | 8 --- .../include/galois/graphs/DistributedGraph.h | 3 - libcusp/include/galois/graphs/NewGeneric.h | 62 +------------------ libdist/include/galois/DTerminationDetector.h | 9 --- libgalois/include/galois/LargeArray.h | 4 -- .../include/galois/graphs/BufferedGraph.h | 3 - 6 files changed, 3 insertions(+), 86 deletions(-) diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index a529e391c7..446e9c7dae 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -203,16 +203,12 @@ class CustomMasterAssignment : public PartitioningScaffold { // found in map if (gidMasterIter != _gid2masters.end()) { uint32_t mappedMaster = gidMasterIter->second; - // galois::gDebug("[", _hostID, "] ", gid, " found with master ", - // mappedMaster, "!"); // make sure host is in bounds assert(mappedMaster < _numHosts); return mappedMaster; } else { // NOT FOUND (not necessarily a bad thing, and required for // some cases) - galois::gDebug("[", _hostID, "] ", gid, - " not found for retrieveMaster!"); if (_status == 2) { // die if we expect all gids to be mapped already (stage 2) GALOIS_DIE("should not fail to find a GID after stage 2 " @@ -253,7 +249,6 @@ class CustomMasterAssignment : public PartitioningScaffold { for (auto i = gid2offsets.begin(); i != gid2offsets.end(); i++) { assert(i->second < localNodeToMaster.size()); - galois::gDebug("Map ", i->first, " to ", localNodeToMaster[i->second]); _gid2masters[i->first] = localNodeToMaster[i->second]; } assert(_gid2masters.size() == (originalSize + gid2offsets.size())); @@ -314,13 +309,10 @@ class CustomMasterAssignment : public PartitioningScaffold { auto offsetIntoMapIter = _gid2masters.find(gid); if (offsetIntoMapIter == _gid2masters.end()) { // NOT FOUND - galois::gDebug("[", _hostID, "] ", gid, " not found; mapping!"); _gid2masters[gid] = mappedMaster; return true; } else { // already mapped - galois::gDebug("[", _hostID, "] ", gid, " already mapped with master ", - offsetIntoMapIter->second, "!"); 
assert(offsetIntoMapIter->second == mappedMaster); return false; } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 41c0e810f1..bf70bbf3e2 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -776,7 +776,6 @@ class DistGraph { withEdgeRanges.size() != 0) { masterRanges = withEdgeRanges; } else { - galois::gDebug("Manually det. master thread ranges"); masterRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, beginMaster, beginMaster + numOwned, 0); @@ -802,7 +801,6 @@ class DistGraph { masterRanges.size() != 0) { withEdgeRanges = masterRanges; } else { - galois::gDebug("Manually det. with edges thread ranges"); withEdgeRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0); } @@ -869,7 +867,6 @@ class DistGraph { * Deallocates underlying LC CSR Graph */ void deallocate() { - galois::gDebug("Deallocating CSR in DistGraph"); graph.deallocate(); } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 4632d3b4d8..33a618c62f 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -1012,8 +1012,6 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedOffsets, std::vector& receivedMasters) { uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first; - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, " offset ", - hostOffset); // if execution gets here, messageType was 1 or 2 assert(receivedMasters.size() == receivedOffsets.size()); @@ -1021,10 +1019,8 @@ class NewDistGraphGeneric : public DistGraph { galois::do_all( galois::iterate((size_t)0, receivedMasters.size()), [&](size_t i) { - uint64_t curGID = hostOffset + receivedOffsets[i]; - uint32_t indexIntoMap = gid2offsets[curGID]; - galois::gDebug("[", base_DistGraph::id, "] gid ", curGID, " offset ", - indexIntoMap); + uint64_t curGID = hostOffset + receivedOffsets[i]; + uint32_t indexIntoMap = gid2offsets[curGID]; localNodeToMaster[indexIntoMap] = receivedMasters[i]; }, galois::no_stats()); @@ -1069,9 +1065,6 @@ class NewDistGraphGeneric : public DistGraph { messageType); } - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); - return std::make_pair(sendingHost, messageType); } @@ -1126,9 +1119,6 @@ class NewDistGraphGeneric : public DistGraph { GALOIS_DIE("invalid message type for sync of master assignments: ", messageType); } - - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); } } while (p); } @@ -1316,8 +1306,6 @@ class NewDistGraphGeneric : public DistGraph { // gid to vector offset setup std::unordered_map gid2offsets; uint64_t neighborCount = phase0MapSetup(ghosts, gid2offsets, syncNodes); - galois::gDebug("[", base_DistGraph::id, "] num neighbors found is ", - neighborCount); // send off neighbor metadata phase0SendRecv(syncNodes); @@ -1386,13 +1374,6 @@ class NewDistGraphGeneric : public DistGraph { auto work = getSpecificThreadRange(bufGraph, rangeVec, beginNode, endNode); - // debug print - // galois::on_each([&] (unsigned i, unsigned j) { - // galois::gDebug("[", base_DistGraph::id, " ", i, "] sync round ", - // syncRound, " local range ", - // *work.local_begin(), " ", *work.local_end()); - //}); - galois::do_all( // iterate over my read nodes 
galois::iterate(work), @@ -1410,10 +1391,6 @@ class NewDistGraphGeneric : public DistGraph { // on map with subtraction localNodeToMaster[node - globalOffset] = assignedHost; - // galois::gDebug("[", base_DistGraph::id, "] state round ", - // syncRound, - // " set ", node, " ", node - globalOffset); - // ptt.stop(); }, galois::loopname("Phase0DetermineMasters"), galois::steal(), @@ -1460,14 +1437,6 @@ class NewDistGraphGeneric : public DistGraph { waitTime.start(); while (hostFinished.count() != base_DistGraph::numHosts || loadsClear.count() != base_DistGraph::numHosts) { - //#ifndef NDEBUG - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts to - // finish, ", - // hostFinished.count()); - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts - // loads " - // "syncs to finish, ", loadsClear.count()); - //#endif // make sure all assignments are done and all loads are done syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished); @@ -1476,15 +1445,9 @@ class NewDistGraphGeneric : public DistGraph { waitTime.stop(); } -#ifndef NDEBUG - printLoad(nodeLoads, nodeAccum); - printLoad(edgeLoads, edgeAccum); -#endif - // sanity check for correctness (all should be assigned) for (uint32_t i = 0; i < localNodeToMaster.size(); i++) { if (localNodeToMaster[i] == (uint32_t)-1) { - // galois::gDebug("[", base_DistGraph::id, "] bad index ", i); assert(localNodeToMaster[i] != (uint32_t)-1); } } @@ -2041,9 +2004,6 @@ class NewDistGraphGeneric : public DistGraph { size_t curCount = 0; // size_t actuallySet = 0; for (uint32_t offset : offsetsToConsider.getOffsets()) { - // galois::gDebug("[", base_DistGraph::id, "] ", " setting ", - // offset + hostOffset, " from host ", senderHost, - // " to ", recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(offset + hostOffset, recvMasterLocations[curCount]); // bool set = graphPartitioner->addMasterMapping(offset + hostOffset, @@ -2051,9 +2011,6 @@ class NewDistGraphGeneric : public DistGraph { // if (set) { actuallySet++; } curCount++; } - - // galois::gDebug("[", base_DistGraph::id, "] host ", senderHost, ": set ", - // actuallySet, " out of ", recvMasterLocations.size()); } /** @@ -2070,9 +2027,6 @@ class NewDistGraphGeneric : public DistGraph { size_t curCount = 0; for (uint64_t gid : gids) { assert(gid < base_DistGraph::numGlobalNodes); - // galois::gDebug("[", base_DistGraph::id, "] ", " in-setting ", gid, " to - // ", - // recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(gid, recvMasterLocations[curCount]); curCount++; } @@ -2133,7 +2087,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, offsets); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2142,7 +2095,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, 1); galois::runtime::gSerialize(b, curBitset); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2268,9 +2220,6 @@ class NewDistGraphGeneric : public DistGraph { inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges); finalizeInspection(prefixSumOfEdges); - galois::gDebug("[", base_DistGraph::id, - "] To receive this many nodes: ", nodesToReceive); - galois::gPrint("[", base_DistGraph::id, "] Inspection mapping 
complete.\n"); return prefixSumOfEdges; } @@ -2307,9 +2256,6 @@ class NewDistGraphGeneric : public DistGraph { galois::block_range((size_t)0, hostSize, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { - // galois::gDebug("[", base_DistGraph::id, "] ", i + startNode, - // " mapped to ", - // graphPartitioner->retrieveMaster(i+startNode)); if (graphPartitioner->retrieveMaster(i + startNode) == myHID) { count++; } @@ -2326,9 +2272,7 @@ class NewDistGraphGeneric : public DistGraph { assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); - uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; - galois::gDebug("[", base_DistGraph::id, "] This many masters from host ", - h, ": ", newMasterNodes); + uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; uint32_t startingNodeIndex = base_DistGraph::numNodes; // increase size of prefix sum + mapping vector prefixSumOfEdges.resize(base_DistGraph::numNodes + newMasterNodes); diff --git a/libdist/include/galois/DTerminationDetector.h b/libdist/include/galois/DTerminationDetector.h index 0f6d696838..28c58b3666 100644 --- a/libdist/include/galois/DTerminationDetector.h +++ b/libdist/include/galois/DTerminationDetector.h @@ -150,10 +150,8 @@ class DGTerminator { bool terminate() { bool active = (local_mdata != 0); - // if (active) galois::gDebug("[", net.ID, "] local work done \n"); if (!active) { active = net.anyPendingSends(); - // if (active) galois::gDebug("[", net.ID, "] pending send \n"); } int snapshot_ended = 0; if (!active) { @@ -166,8 +164,6 @@ class DGTerminator { } if (!active) { // check pending receives after checking snapshot active = net.anyPendingReceives(); - if (active) - galois::gDebug("[", net.ID, "] pending receive"); } if (active) { work_done = true; @@ -178,16 +174,11 @@ class DGTerminator { work_done = false; prev_snapshot = snapshot; ++snapshot; - galois::gDebug("[", net.ID, "] work done, taking snapshot ", - snapshot); initiate_snapshot(); } else if (prev_snapshot != snapshot) { prev_snapshot = snapshot; - galois::gDebug("[", net.ID, "] no work done, taking snapshot ", - snapshot); initiate_snapshot(); } else { - galois::gDebug("[", net.ID, "] terminating ", snapshot); // an explicit barrier may be required here // so that the next async phase begins on all hosts at the same time // however, this may add overheads when it is not required diff --git a/libgalois/include/galois/LargeArray.h b/libgalois/include/galois/LargeArray.h index da2b89b916..71df3036ff 100644 --- a/libgalois/include/galois/LargeArray.h +++ b/libgalois/include/galois/LargeArray.h @@ -80,21 +80,17 @@ class LargeArray { m_size = n; switch (t) { case Blocked: - galois::gDebug("Block-alloc'd"); m_realdata = substrate::largeMallocBlocked(n * sizeof(T), runtime::activeThreads); break; case Interleaved: - galois::gDebug("Interleave-alloc'd"); m_realdata = substrate::largeMallocInterleaved(n * sizeof(T), runtime::activeThreads); break; case Local: - galois::gDebug("Local-allocd"); m_realdata = substrate::largeMallocLocal(n * sizeof(T)); break; case Floating: - galois::gDebug("Floating-alloc'd"); m_realdata = substrate::largeMallocFloating(n * sizeof(T)); break; }; diff --git a/libgalois/include/galois/graphs/BufferedGraph.h b/libgalois/include/galois/graphs/BufferedGraph.h index 7140506311..e5e3fa4221 100644 --- a/libgalois/include/galois/graphs/BufferedGraph.h +++ b/libgalois/include/galois/graphs/BufferedGraph.h @@ -181,8 +181,6 @@ class BufferedGraph { void loadEdgeData(std::ifstream& 
graphFile, uint64_t edgeStart, uint64_t numEdgesToLoad, uint64_t numGlobalNodes, uint64_t numGlobalEdges) { - galois::gDebug("Loading edge data"); - if (numEdgesToLoad == 0) { return; } @@ -233,7 +231,6 @@ class BufferedGraph { typename EdgeType, typename std::enable_if::value>::type* = nullptr> void loadEdgeData(std::ifstream&, uint64_t, uint64_t, uint64_t, uint64_t) { - galois::gDebug("Not loading edge data"); // do nothing (edge data is void, i.e. no edge data) } From 9a33c6f806c33289f733ac153509dcd894152e2f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 13 Oct 2020 13:53:12 -0500 Subject: [PATCH 369/660] Aggregation sync + feature reading fix + unit test 1) Fixes a bug in distributed feature reading that caused features to be read incorrectly (GID 0 -> LID 0 instead of GID 0 -> LID of global node 0). Fixed in both refactored and non-refactored code. Preliminary experiments show that it pretty much fixes accuracy in a distributed setting. (have yet to check if it reaches single host accuracy) 2) Adds preliminary aggregation sync call to forward/backward phase: trivial summation of rows of some matrix. Had to add globals to work with current Gluon sync structures (having Katana's refactor would be quite nice at this point in time......) 3) Adds a unit test for the aggregation sync. Pretty much the old conv layer sync call except that it adds logic to deal with distributed rows. --- libdeepgalois/src/DistContext.cpp | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 18 ++ .../graphs/GraphAggregationSyncStructures.h | 66 ++++++ libgnn/src/GNNGraph.cpp | 56 +++-- libgnn/src/GraphConvolutionalLayer.cpp | 12 +- libgnn/test/CMakeLists.txt | 5 + libgnn/test/aggregate-sync-test.cpp | 200 ++++++++++++++++++ libgnn/test/convlayer-test.cpp | 1 + 8 files changed, 338 insertions(+), 22 deletions(-) create mode 100644 libgnn/include/galois/graphs/GraphAggregationSyncStructures.h create mode 100644 libgnn/test/aggregate-sync-test.cpp diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e9f0ef4214..21bcad0fe3 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -136,7 +136,7 @@ size_t DistContext::read_features(std::string dataset_str) { // h_feats[count * feat_len] = fullFeats[i]; std::copy(fullFeats + i * DistContext::feat_len, fullFeats + (i + 1) * DistContext::feat_len, - &this->h_feats[count * DistContext::feat_len]); + &this->h_feats[dGraph->getLID(i) * DistContext::feat_len]); count++; } } diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 79d96d0da5..a0b1430add 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -3,6 +3,7 @@ #include "galois/GNNTypes.h" #include "galois/graphs/CuSPPartitioner.h" #include "galois/graphs/GluonSubstrate.h" +#include "galois/graphs/GraphAggregationSyncStructures.h" namespace galois { @@ -38,6 +39,12 @@ class GNNGraph { GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); + //! Returns host id + size_t host_id() const { return host_id_; } + + //! Returns host id in brackets to use for printing things + const std::string& host_prefix() const { return host_prefix_; } + //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } @@ -102,10 +109,21 @@ class GNNGraph { return partitioned_graph_->edge_dst_ptr(); } + //! Given a matrix and the column size, do an aggregate sync where each row + //! 
is considered a node's data and sync using the graph's Gluon + //! substrate + //! Note that it's const because the only thing being used is the graph + //! topology of this object; the thing modified is the passed in matrix + void AggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size) const; + private: //! In a multi-host setting, this variable stores the host id that the graph //! is currently running on unsigned host_id_; + //! String header that can be used for debug print statements to get the host + //! this graph is on + std::string host_prefix_; //! Number of classes for a single vertex label size_t num_label_classes_{1}; //! Length of a feature node diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h new file mode 100644 index 0000000000..75a18fd830 --- /dev/null +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -0,0 +1,66 @@ +// defined in GNNGraph.cpp; set in order to control which matrix +// gets synchronized +#include "galois/GNNTypes.h" +#include "galois/BufferWrapper.h" + +namespace galois { +namespace graphs { + +extern GNNFloat* gnn_matrix_to_sync_; +extern size_t gnn_matrix_to_sync_column_length_; + +struct GNNSumAggregate { + using ValTy = galois::BufferWrapper; + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, char&) { + ValTy extracted_vec( + &gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_], + gnn_matrix_to_sync_column_length_); + // move constructor should kick in here to avoid return copy + return extracted_vec; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] += + y[i]; + } + return true; + } + + //! do nothing (waste of a write) + static void reset(uint32_t, char&) {} + + //! 
element wise set + static void setVal(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp index 38a78d68dc..a327dfe641 100644 --- a/libgnn/src/GNNGraph.cpp +++ b/libgnn/src/GNNGraph.cpp @@ -28,6 +28,13 @@ LoadPartition(const std::string& dataset_name, } // end namespace +namespace galois { +namespace graphs { +GNNFloat* gnn_matrix_to_sync_ = nullptr; +size_t gnn_matrix_to_sync_column_length_ = 0; +} // namespace graphs +} // namespace galois + galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label) { @@ -35,6 +42,10 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, dataset_name); // save host id host_id_ = galois::runtime::getSystemNetworkInterface().ID; + host_prefix_ = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); // load partition partitioned_graph_ = LoadPartition(dataset_name, partition_scheme); @@ -90,6 +101,19 @@ bool galois::graphs::GNNGraph::IsValidForPhase( } } +void galois::graphs::GNNGraph::AggregateSync( + GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { + // set globals for the sync substrate + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + + // XXX bitset setting + + // call sync + sync_substrate_->sync( + "GraphAggregateSync"); +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -164,31 +188,32 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); // read in dimensions of features, specifically node feature length - size_t num_vertices; + size_t num_global_vertices; std::string file_dims = galois::gnn_dataset_path + dataset_name + "-dims.txt"; std::ifstream ifs; ifs.open(file_dims, std::ios::in); - ifs >> num_vertices >> node_feature_length_; + ifs >> num_global_vertices >> node_feature_length_; ifs.close(); - GALOIS_LOG_ASSERT(num_vertices == partitioned_graph_->globalSize()); - GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_vertices, + GALOIS_LOG_ASSERT(num_global_vertices == partitioned_graph_->globalSize()); + GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_global_vertices, node_feature_length_); // memory for all features of all nodes in graph // TODO read features without loading entire feature file into memory; this // is quite inefficient std::unique_ptr full_feature_set = - 
std::make_unique(num_vertices * node_feature_length_); + std::make_unique(num_global_vertices * node_feature_length_); // read in all features std::ifstream file_stream; std::string feature_file = galois::gnn_dataset_path + dataset_name + "-feats.bin"; file_stream.open(feature_file, std::ios::binary | std::ios::in); - file_stream.read((char*)full_feature_set.get(), - sizeof(GNNFloat) * num_vertices * node_feature_length_); + file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * + num_global_vertices * + node_feature_length_); file_stream.close(); // allocate memory for local features @@ -196,18 +221,19 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( node_feature_length_); // copy over features for local nodes only - size_t local_vertex = 0; - for (size_t i = 0; i < num_vertices; i++) { - if (partitioned_graph_->isLocal(i)) { + size_t num_kept_vertices = 0; + for (size_t gid = 0; gid < num_global_vertices; gid++) { + if (partitioned_graph_->isLocal(gid)) { // copy over feature vector - std::copy(full_feature_set.get() + i * node_feature_length_, - full_feature_set.get() + (i + 1) * node_feature_length_, - &local_node_features_[local_vertex * node_feature_length_]); - local_vertex++; + std::copy(full_feature_set.get() + gid * node_feature_length_, + full_feature_set.get() + (gid + 1) * node_feature_length_, + &local_node_features_[partitioned_graph_->getLID(gid) * + node_feature_length_]); + num_kept_vertices++; } } full_feature_set.reset(); - GALOIS_LOG_ASSERT(local_vertex++ == partitioned_graph_->size()); + GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } //! Helper function to read masks from file into the appropriate structures diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 5a222ced62..75125e6fe2 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -54,9 +54,6 @@ galois::GraphConvolutionalLayer::ForwardPhase( // TODO synchronization of aggregation functions - // TODO if input columns > output columns do update first then aggregate for - // efficiency - if (config_.do_activation) { GALOIS_LOG_VERBOSE("Doing activation"); Activation(); @@ -116,9 +113,9 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( layer_weight_gradients_.data()); } - // TODO sync agg/update - - // TODO sync weights + // sync weight gradients; note aggregation sync occurs in the function call + // already + // XXX if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); @@ -176,6 +173,9 @@ void galois::GraphConvolutionalLayer::AggregateAll( } }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + + // aggregate sync + graph_.AggregateSync(aggregate_output, column_length); } void galois::GraphConvolutionalLayer::UpdateEmbeddings( diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 7ad7bf1888..70dc3c2b65 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -30,4 +30,9 @@ add_executable(epoch-test epoch-test.cpp) target_link_libraries(epoch-test galois_gnn) add_test(NAME epoch-test COMMAND epoch-test) +# TODO figure out how to make this test run in parallel +add_executable(aggregate-sync-test aggregate-sync-test.cpp) +target_link_libraries(aggregate-sync-test galois_gnn) +#add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) + # TODO multi host tests? 
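For orientation, a minimal sketch of how a layer drives this sync, assuming it runs inside a layer that holds a GNNGraph reference named graph_ (the column width is illustrative; AggregateSync, GNNSumAggregate, and the two globals are the pieces added above). The aggregate-sync test added next exercises this same path across hosts.

    // Row-major output: one row of column_length floats per local node,
    // filled with the sum of each node's neighbor embeddings.
    size_t column_length = 16;  // illustrative feature width
    std::vector<galois::GNNFloat> aggregate_output(
        graph_.size() * column_length, 0.0);
    // ... aggregation loop fills the rows ...
    // AggregateSync points gnn_matrix_to_sync_ and
    // gnn_matrix_to_sync_column_length_ at this matrix, then runs the Gluon
    // sync with GNNSumAggregate so every proxy of a node ends up with the
    // same summed row.
    graph_.AggregateSync(aggregate_output.data(), column_length);
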
diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp new file mode 100644 index 0000000000..a6ca42e963 --- /dev/null +++ b/libgnn/test/aggregate-sync-test.cpp @@ -0,0 +1,200 @@ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + if (galois::runtime::getSystemNetworkInterface().Num == 1) { + GALOIS_LOG_ERROR("This test should be run with multiple hosts/processes"); + exit(1); + } + + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // print edges for sanity + for (size_t node = 0; node < test_graph->size(); node++) { + for (auto e = test_graph->EdgeBegin(node); e != test_graph->EdgeEnd(node); + e++) { + galois::gPrint(test_graph->host_prefix(), "Edge ", + test_graph->GetGID(node), " ", + test_graph->GetGID(test_graph->EdgeDestination(e)), "\n"); + } + } + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), + .input_columns = 3, + .output_columns = 2}; + + // create the layer, no norm factor + // note layer number is 1 so that it does something in backward phase + std::unique_ptr layer_0 = + std::make_unique( + 0, *(test_graph.get()), dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + const std::vector& layer_0_forward_output = + layer_0->ForwardPhase(test_graph->GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check output + ////////////////////////////////////////////////////////////////////////////// + + // check each row on each host: convert row into GID, and based on GID we + // know what the ground truth is + // row 0 = 3 + // row 1 = 6 + // row 2 = 12 + // row 3 = 18 + // row 4 = 24 + // row 5 = 30 + // row 6 = 15 + + // row should correspond to LID + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_0_forward_output[row * 2 + c] == ground_truth); + } + } + + ////////////////////////////////////////////////////////////////////////////// + + std::vector dummy_ones(test_graph->size() * 2, 1); + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + std::vector* layer_0_backward_output = + layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + GALOIS_LOG_ASSERT(layer_0_backward_output->size() == test_graph->size() * 3); + for (size_t i = 
0; i < layer_0_backward_output->size(); i++) { + GALOIS_LOG_ASSERT((*layer_0_backward_output)[i] == 0); + } + + ////////////////////////////////////////////////////////////////////////////// + // layer 1 to check backward output + ////////////////////////////////////////////////////////////////////////////// + std::unique_ptr layer_1 = + std::make_unique( + 1, *(test_graph.get()), dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); + layer_1->InitAllWeightsTo1(); + const std::vector& layer_1_forward_output = + layer_1->ForwardPhase(test_graph->GetLocalFeatures()); + + // same check for forward as before + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_1_forward_output[row * 2 + c] == ground_truth); + } + } + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones.assign(test_graph->size() * 2, 1); + std::vector* layer_1_backward_output = + layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + case 6: + ground_truth = 2; + break; + case 1: + case 2: + case 3: + case 4: + case 5: + ground_truth = 4; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 3 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT((*layer_1_backward_output)[row * 3 + c] == + ground_truth); + } + } +} diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 3f46fb84d2..0bb6c25a4c 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -20,6 +20,7 @@ int main() { std::vector feats = test_graph.GetLocalFeatures(); ////////////////////////////////////////////////////////////////////////////// // doubles as a test for reading as well + GALOIS_LOG_ASSERT(7 == test_graph.size()); GALOIS_LOG_ASSERT(21 == feats.size()); GALOIS_LOG_ASSERT(0.0 == feats[0]); GALOIS_LOG_ASSERT(0.0 == feats[1]); From 4acdd1f5050e3fc4c19d1822c78d5bd986e0c9ee Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 13 Oct 2020 19:19:35 -0500 Subject: [PATCH 370/660] GNN weight gradient synchronization + unit test Adds a wrapper over any vector to make it possible to sync the vector using Gluon: assumes the vector is replicated present on all hosts. Use this wrapper to synchronize weight gradients calculated on each host during the backward phase of computation. This commit includes a sum and an average sync function with the average one being used by default. TODO later is to make this user configurable. Adds a unit test to make sure the weight gradient works as expected. With this commit an end to end training framework should work. There are still some accuracy issues to resolve, however (not matching old code accuracy). 
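The ownership model behind the wrapper is straightforward: every host keeps the full gradient vector, but each host masters only one contiguous block of it and mirrors the rest. A rough sketch of that split, using the same block_range helper the new GluonGradientInterface constructor uses (the weight count here is made up for illustration):

    // Illustrative: a 4 x 7 weight matrix flattened into 28 gradient entries.
    size_t num_weights = 28;
    size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num;
    size_t my_host = galois::runtime::getSystemNetworkInterface().ID;
    // Entries in [first, second) are this host's masters; every other entry
    // is a mirror that Gluon reduces into its owner and broadcasts back when
    // the gradient sync runs.
    std::pair<size_t, size_t> my_block =
        galois::block_range((size_t)0, num_weights, my_host, num_hosts);
    size_t begin_master = my_block.first;   // e.g. host 0 of 4 owns [0, 7)
    size_t end_master = my_block.second;
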
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/layers/GNNLayer.h | 13 ++++ .../galois/layers/GluonGradientInterface.h | 70 +++++++++++++++++++ .../galois/layers/GradientSyncStructures.h | 37 ++++++++++ libgnn/src/GNNLayer.cpp | 33 +++++++++ libgnn/src/GluonGradientInterface.cpp | 49 +++++++++++++ libgnn/src/GraphConvolutionalLayer.cpp | 2 +- libgnn/test/CMakeLists.txt | 3 + libgnn/test/aggregate-sync-test.cpp | 3 +- libgnn/test/convlayer-test.cpp | 1 - libgnn/test/weight-sync-test.cpp | 42 +++++++++++ 11 files changed, 251 insertions(+), 3 deletions(-) create mode 100644 libgnn/include/galois/layers/GluonGradientInterface.h create mode 100644 libgnn/include/galois/layers/GradientSyncStructures.h create mode 100644 libgnn/src/GluonGradientInterface.cpp create mode 100644 libgnn/test/weight-sync-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 88398c3d60..e8c109df6c 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -14,6 +14,7 @@ set(sources src/SoftmaxLayer.cpp src/GraphNeuralNetwork.cpp src/GNNOptimizers.cpp + src/GluonGradientInterface.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index ac6cc9dd0e..2232e82b5c 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -3,6 +3,7 @@ #include "galois/PerThreadRNG.h" #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" +#include "galois/layers/GluonGradientInterface.h" namespace galois { @@ -141,6 +142,12 @@ class GNNLayer { std::vector layer_weights_; //! Gradients used to update the weights of this layer std::vector layer_weight_gradients_; + //! Wrapper over gradient matrix to make it compatible with Gluon + std::unique_ptr gradient_sync_interface_; + //! Synchronization substrate for the weight gradients + std::unique_ptr> + gradient_sync_substrate_; + // There is a forward and a backward as their sizes will differ and we only // want to allocate memory once to avoid runtime memory allocation. //! The output of the forward phase for this layer. @@ -188,6 +195,12 @@ class GNNLayer { void Activation(); //! Calculate derivative of activation function based on config on the matrix void ActivationDerivative(std::vector* matrix); + + //! Synchronize weight gradients with a summation + void WeightGradientSyncSum(); + //! Synchronize weight gradients with a summation, then locally divide all + //! weights to get an average + void WeightGradientSyncAverage(); }; } // namespace galois diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h new file mode 100644 index 0000000000..92c0a5eb69 --- /dev/null +++ b/libgnn/include/galois/layers/GluonGradientInterface.h @@ -0,0 +1,70 @@ +#pragma once + +#include "galois/GNNTypes.h" +#include "galois/gstl.h" +#include "galois/runtime/Network.h" + +namespace galois { + +// TODO figure out which function calls can be removed without causing compiler +// to complain + +//! Wraps a matrix and allows it to be synchronized via Gluon as it provides +//! all the functions Gluon needs. +//! Assumes the matrix is initialized the same way across all hosts (if not +//! they'll all see the same values after the first round of sync anyways) +class GluonGradientInterface { +public: + //! Save reference to weight gradients. + //! Then setup mirror metadata for Gluon to use during setup. + GluonGradientInterface(std::vector& gradients); + + //! 
Size is number of weights since all hosts own everything + size_t size() const { return num_weights_; } + //! Global size is number of weights + size_t globalSize() const { return num_weights_; } + //! Return the weights owned by this host + size_t numMasters() const { return num_owned_; } + //! GID is same as LID since all hosts have all weights + uint32_t getGID(const uint32_t node_id) const { return node_id; } + //! LID is same as GID since all hosts have all weights + uint32_t getLID(const uint32_t node_id) const { return node_id; } + //! Return weight w + GNNFloat& getData(uint32_t w) const { return gradients_[w]; } + //! Return ranges for mirrors (unowned nodes) + const std::vector>& getMirrorRanges() const { + return mirror_ranges_; + } + //! Return mirror nodes for each host from this host's point of view + std::vector>& getMirrorNodes() { return mirror_nodes_; } + + ////////////////////////////////////////////////////////////////////////////// + + // for all that follow, no edges in this sync so most of this returns what + // you expect + // size_t getNumNodesWithEdges() const { return 0; } + bool is_vertex_cut() const { return false; } + unsigned edge_begin(uint32_t) const { return 0; } + unsigned edge_end(uint32_t) const { return 0; } + unsigned getEdgeDst(uint32_t) const { return 0; } + unsigned getEdgeData(uint32_t) const { return 0; } + void deallocate() const {}; + +private: + //! Reference to gradients that can get synchronized + std::vector& gradients_; + //! number of weight gradients + size_t num_weights_; + //! number of single gradients this host is responsible for + size_t num_owned_; + //! First weight that's a master + size_t begin_master_; + //! Last weight that's a master + size_t end_master_; + //! My nodes whose's masters are on other hosts; global ids + std::vector> mirror_nodes_; + //! nodes that are mirrors on this host + std::vector> mirror_ranges_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GradientSyncStructures.h b/libgnn/include/galois/layers/GradientSyncStructures.h new file mode 100644 index 0000000000..32b7a85b82 --- /dev/null +++ b/libgnn/include/galois/layers/GradientSyncStructures.h @@ -0,0 +1,37 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { + +//! Simple summation of values +struct WeightGradientSummation { + using ValTy = GNNFloat; + static ValTy extract(uint32_t, ValTy& weight) { return weight; } + static bool reduce(uint32_t, ValTy& weight, ValTy y) { + weight += y; + return true; + } + + //! reset weight to 0 + static void reset(uint32_t, ValTy& weight) { weight = 0.0; } + + //! 
save weight + static void setVal(uint32_t, ValTy& weight, ValTy y) { weight = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +} // namespace galois diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index bcefd42efe..838bf45905 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -1,5 +1,6 @@ #include "galois/Logging.h" #include "galois/layers/GNNLayer.h" +#include "galois/layers/GradientSyncStructures.h" galois::GNNLayer::GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, @@ -18,6 +19,15 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); GlorotBengioInit(&layer_weights_); + + // initialize sync substrate + gradient_sync_interface_ = + std::make_unique(layer_weight_gradients_); + gradient_sync_substrate_ = std::make_unique< + galois::graphs::GluonSubstrate>( + *gradient_sync_interface_, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_output_elements = @@ -124,3 +134,26 @@ void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, optimizer->GradientDescent(layer_weight_gradients_, &layer_weights_, trainable_layer_number); } + +void galois::GNNLayer::WeightGradientSyncSum() { + // XXX bitset + gradient_sync_substrate_->sync( + "WeightGradientsSync"); +} + +void galois::GNNLayer::WeightGradientSyncAverage() { + size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; + if (num_hosts > 1) { + // XXX bitset + // sum, then average by dividing all by num hosts (every host participates + // in sync) + gradient_sync_substrate_->sync( + "WeightGradientsSyncAverage"); + galois::do_all( + galois::iterate(static_cast(0), layer_weight_gradients_.size()), + [&](size_t weight_index) { + layer_weight_gradients_[weight_index] /= num_hosts; + }, + galois::loopname("WeightGradientSyncAverageDivide")); + } +} diff --git a/libgnn/src/GluonGradientInterface.cpp b/libgnn/src/GluonGradientInterface.cpp new file mode 100644 index 0000000000..31770afb4e --- /dev/null +++ b/libgnn/src/GluonGradientInterface.cpp @@ -0,0 +1,49 @@ +#include "galois/layers/GluonGradientInterface.h" + +galois::GluonGradientInterface::GluonGradientInterface( + std::vector& gradients) + : gradients_(gradients), num_weights_(gradients_.size()) { + size_t my_host = galois::runtime::getSystemNetworkInterface().ID; + size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; + + // allocate a vector for each host + mirror_nodes_.resize(num_hosts); + + // loop through distribution of weights to hosts + for (unsigned h = 0; h < num_hosts; h++) { + std::pair cur_range = + galois::block_range((size_t)0, num_weights_, h, num_hosts); + + if (h != my_host) { + // setup mirrors for the host h which is just the list of IDs + size_t current_weight = cur_range.first; + size_t last_weight = cur_range.second; + size_t num_host_weights = last_weight - 
current_weight; + + // set mirrors for host h + mirror_nodes_[h].reserve(num_host_weights); + for (; current_weight < last_weight; current_weight++) { + mirror_nodes_[h].push_back(current_weight); + } + } else { + // these belong to this host; save, then mirror ranges can be + // calculated from this + begin_master_ = cur_range.first; + end_master_ = cur_range.second; + num_owned_ = end_master_ - begin_master_; + + // first range is 0 to begin master + if (begin_master_ > 0) { + mirror_ranges_.emplace_back(0, begin_master_); + } + + // second range is endMaster to end + if (end_master_ < num_weights_) { + mirror_ranges_.emplace_back(end_master_, num_weights_); + } + } + } + + galois::gInfo("[", my_host, "] Weight gradients: this host owns ", + begin_master_, " to ", end_master_); +} diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 75125e6fe2..d02a2bf0ca 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -115,7 +115,7 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already - // XXX + WeightGradientSyncAverage(); if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 70dc3c2b65..69ef29b43f 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -35,4 +35,7 @@ add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) +add_executable(weight-sync-test weight-sync-test.cpp) +target_link_libraries(weight-sync-test galois_gnn) + # TODO multi host tests? 
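Numerically, the average path is just the sum sync followed by a local divide by the host count, so every host finishes with identical gradients. A small worked sketch using the per-host values the weight-sync test below asserts on (taken from the 4-host "tester" partition):

    // One weight-gradient entry as computed locally on each of the 4 hosts.
    galois::GNNFloat per_host_grads[] = {18.0, 21.0, 12.0, 0.0};
    galois::GNNFloat sum = 0.0;
    for (galois::GNNFloat g : per_host_grads)
      sum += g;                              // after the sum sync: 51
    galois::GNNFloat averaged = sum / 4.0;   // divide by Num on each host: 12.75
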
diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index a6ca42e963..6b67c65bfe 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -30,7 +30,6 @@ int main() { .output_columns = 2}; // create the layer, no norm factor - // note layer number is 1 so that it does something in backward phase std::unique_ptr layer_0 = std::make_unique( 0, *(test_graph.get()), dimension_0, @@ -197,4 +196,6 @@ int main() { ground_truth); } } + + // XXX TODO CVC } diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 0bb6c25a4c..ffe3bb6513 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -49,7 +49,6 @@ int main() { .input_rows = 7, .input_columns = 3, .output_columns = 2}; // create the layer, no norm factor - // note layer number is 1 so that it does something in backward phase std::unique_ptr layer_0 = std::make_unique( 0, test_graph, dimension_0, diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp new file mode 100644 index 0000000000..33c08df29b --- /dev/null +++ b/libgnn/test/weight-sync-test.cpp @@ -0,0 +1,42 @@ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + if (galois::runtime::getSystemNetworkInterface().Num == 4) { + GALOIS_LOG_ERROR("This test should be run with 4 hosts/processes"); + exit(1); + } + + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), + .input_columns = 3, + .output_columns = 2}; + + // create the layer, no norm factor + std::unique_ptr layer_0 = + std::make_unique( + 0, *(test_graph.get()), dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); + layer_0->InitAllWeightsTo1(); + + // backward pass checking; check the gradients out + std::vector dummy_ones(test_graph->size() * 2, 1); + layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + + // gradient verification; average + // host 0 has 18, 1 has 21, 2 has 12, 3 has 0s; averaged to 12.75 + const std::vector& grads = + layer_0->GetLayerWeightGradients(); + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(grads[i] == 12.75); + } + + // XXX CVC +} From b7c9fc176e547942a84702b4c663697d49126231 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Oct 2020 14:26:26 -0500 Subject: [PATCH 371/660] disabled ReLU in the old gnn code It hurts more than helps from my experience --- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f6741f4b6d..da9b01dbae 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -76,6 +76,8 @@ void graph_conv_layer::malloc_and_init() { norm_scores_grad.resize(ne); epsilon = 0.2; // LeakyReLU angle of negative slope #endif + dropout_ = true; + act_ = false; if (dropout_) dropout_mask = new mask_t[x * y]; From 8f289eaa68435f6db54bfe604d1eabe2de10dfbb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Oct 2020 19:23:28 -0500 Subject: [PATCH 372/660] libgnn needs MKL; non-blas dot product 1) Move over to use MKL instead of OpenBLAS for libgnn because MKL is 
way easier to find on other machines. 2) Dot product use in GNN math is now a regular for loop rather than a call to cblas (to avoid nested parallelism problem; some prelim testing shows it doesn't affect performance). --- CMakeLists.txt | 4 +--- libgnn/CMakeLists.txt | 18 +++++++----------- libgnn/include/galois/GNNMath.h | 2 +- libgnn/src/GNNMath.cpp | 16 +++++++++++++--- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc01f4a1ef..937251376c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,11 +143,9 @@ endif() if(USE_MKL_BLAS) SET(INTEL_ROOT /opt/apps/sysnet/intel/19.0) SET(MKL_ROOT ${INTEL_ROOT}/mkl) - SET(INTEL_LIBS_DIR ${INTEL_ROOT}/lib/intel64_lin) - find_package(MKL) + find_package(MKL REQUIRED) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") if (MKL_FOUND) - include_directories(${MKL_INCLUDE_DIRS}) else() message(WARNING "MKL not found") endif() diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index e8c109df6c..9d1b18b682 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,11 +1,3 @@ -set(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) -set(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) -# blas library -include_directories(${BLAS_INC_DIR}) -link_directories(${BLAS_LIB_DIR}) - -set(BLAS_LIB "-lopenblas") - set(sources src/GNNGraph.cpp src/GNNLayer.cpp @@ -17,15 +9,19 @@ set(sources src/GluonGradientInterface.cpp ) +set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) +set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + add_library(galois_gnn STATIC ${sources}) +target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) target_link_libraries(galois_gnn galois_shmem) -target_link_libraries(galois_gnn ${MPI_CXX_LIBRARIES}) -target_link_libraries(galois_gnn ${BLAS_LIB} ${BOOST_LIBRARIES}) +target_link_libraries(galois_gnn ${INTEL_LIBS}) target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include + ${MKL_INCLUDE_DIRS} ) -set_target_properties(galois_gnn PROPERTIES EXPORT_NAME gluon) +set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) add_subdirectory(test) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 2cf913d5de..488b538d75 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -1,7 +1,7 @@ #pragma once #include "galois/GNNTypes.h" -#include +#include namespace galois { diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 0087c7340b..5e9fb8d050 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -68,9 +68,18 @@ void galois::GNNSoftmaxDerivative(const size_t vector_length, temp_vector[j] = (j == i) ? prev_output[i] * (1.0 - prev_output[i]) : -prev_output[j] * prev_output[i]; } - // TODO is sdot using threads? if so this is a nested parallelism problem - output[i] = - cblas_sdot(vector_length, prev_output_derivative, 1, temp_vector, 1); + GNNFloat sdot_result = 0; + // TODO use vector instructions? would need another loop to add everything + // together + a temp vector to store results so probably about the same? 
+ for (size_t k = 0; k < vector_length; k++) { + sdot_result += prev_output_derivative[k] * temp_vector[k]; + } + output[i] = sdot_result; + + // TODO this is currently disabled because of a nested parallelism problem + // (cblas may use more threads) + // output[i] = + // cblas_sdot(vector_length, prev_output_derivative, 1, temp_vector, 1); } } @@ -113,6 +122,7 @@ void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, size_t lead_dim_b = (trans_b == CblasNoTrans) ? output_columns : input_columns; // do the MM + // TODO roll our own sgemm rather than use 3rd party? cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, 0.0, output, output_columns); From 0ed623fba833fc6a026837f877e2cf4592e1a674 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Oct 2020 19:50:59 -0500 Subject: [PATCH 373/660] Removed structure initializer lists from libgnn Some compilers don't like structure init lists and will fail to compile. This commit makes the code way less compact and manually declares structures to get around this annoying limitation. --- libgnn/include/galois/GraphNeuralNetwork.h | 6 +---- libgnn/test/adam-test.cpp | 8 +++++-- libgnn/test/aggregate-sync-test.cpp | 19 ++++++++------- libgnn/test/convlayer-test.cpp | 28 ++++++++++++---------- libgnn/test/epoch-test.cpp | 12 ++++++---- libgnn/test/gnnfb-test.cpp | 6 +++-- libgnn/test/softmaxlayer-test.cpp | 8 +++---- libgnn/test/weight-sync-test.cpp | 14 ++++++----- 8 files changed, 56 insertions(+), 45 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 80f7b07916..1762cda8da 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -30,11 +30,7 @@ class GraphNeuralNetworkConfig { const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type) : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, - output_layer_type, - GNNConfig{.do_dropout = true, - .dropout_rate = 0.3, - .do_activation = true, - .do_normalization = true}) {} + output_layer_type, GNNConfig()) {} //! 
Construction with a specified config for layers GraphNeuralNetworkConfig(size_t num_layers, diff --git a/libgnn/test/adam-test.cpp b/libgnn/test/adam-test.cpp index e01368ce87..dfdfcdad00 100644 --- a/libgnn/test/adam-test.cpp +++ b/libgnn/test/adam-test.cpp @@ -14,8 +14,12 @@ int main() { num_threads); // create sample config that is easy to trace - galois::AdamOptimizer::AdamConfiguration config = { - .alpha = 1, .beta1 = 0.5, .beta2 = 0.5, .epsilon = 0}; + galois::AdamOptimizer::AdamConfiguration config; + config.alpha = 1; + config.beta1 = 0.5; + config.beta2 = 0.5; + config.epsilon = 0; + std::vector layer_sizes = {2, 1}; galois::AdamOptimizer adam(config, layer_sizes, 2); diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 6b67c65bfe..ea184e3e2a 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -25,15 +25,17 @@ int main() { // create same layer from convlayer-test and make sure result is the same even // in multi-host environment - galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), - .input_columns = 3, - .output_columns = 2}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + galois::GNNConfig l_config; + l_config.allow_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique( - 0, *(test_graph.get()), dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(0, *(test_graph.get()), + dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const std::vector& layer_0_forward_output = @@ -115,9 +117,8 @@ int main() { // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// std::unique_ptr layer_1 = - std::make_unique( - 1, *(test_graph.get()), dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(1, *(test_graph.get()), + dimension_0, l_config); layer_1->InitAllWeightsTo1(); const std::vector& layer_1_forward_output = layer_1->ForwardPhase(test_graph->GetLocalFeatures()); diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index ffe3bb6513..1d89cf198a 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -45,14 +45,18 @@ int main() { GALOIS_LOG_ASSERT(6.0 == feats[20]); ////////////////////////////////////////////////////////////////////////////// - galois::GNNLayerDimensions dimension_0{ - .input_rows = 7, .input_columns = 3, .output_columns = 2}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNConfig dcon; + dcon.allow_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique( - 0, test_graph, dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(0, test_graph, + dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const std::vector& layer_0_forward_output = @@ -134,9 +138,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique( - 1, test_graph, dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(1, test_graph, + dimension_0, dcon); 
layer_1->InitAllWeightsTo1(); const std::vector& layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -201,10 +204,11 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - galois::GNNConfig config = {.do_dropout = true, - .do_activation = true, - .do_normalization = true, - .allow_aggregate_after_update = false}; + galois::GNNConfig config; + config.do_dropout = true; + config.do_activation = true; + config.do_normalization = true; + config.allow_aggregate_after_update = false; // finally, just make sure dropout and activation run without crashes // (verification requires floating point accuracy or setting a seed which I diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index c37a7d2e34..20c987be60 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,19 +14,21 @@ int main() { // load graph auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kOEC, true); + "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, galois::GNNLayerType::kGraphConvolutional}; std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + galois::GNNConfig layer_config; + layer_config.do_dropout = true; + layer_config.do_activation = false; + layer_config.do_normalization = true; // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.do_dropout = true, - .do_activation = false, - .do_normalization = true}); + layer_config); std::vector adam_sizes = {16 * test_graph->node_feature_length(), 16 * test_graph->GetNumLabelClasses()}; @@ -41,7 +43,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 100; epoch++) { + for (size_t epoch = 0; epoch < 20; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 50c40ff2c1..692cbfd30c 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -23,11 +23,13 @@ int main() { galois::GNNLayerType::kGraphConvolutional}; // note this includes the output; last 2 must be same because softmax std::vector layer_output_sizes = {4, 7, 7}; + galois::GNNConfig dcon; + dcon.allow_aggregate_after_update = false; // note GNNConfig is passed in; use a config that does not do anything extra // like dropout or activation and the like so that input is easier to verify galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.allow_aggregate_after_update = false}); + dcon); // input is 7 x 3, layers are then 3 x 4 and 4 x 7 and 7 x 7 // middle 2 are trainable so 12 and 28 std::vector adam_sizes = {12, 28}; @@ -171,7 +173,7 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.allow_aggregate_after_update = false}); + dcon); auto adam2 = std::make_unique(adam_sizes, 2); auto gnn2 = std::make_unique( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); diff --git 
a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index bd3cd8c5e3..5d9fa87728 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -17,10 +17,10 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); // input/output columns must be same in softmax - galois::GNNLayerDimensions dimension_0{ - .input_rows = 7, - .input_columns = test_graph.GetNumLabelClasses(), - .output_columns = test_graph.GetNumLabelClasses()}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 33c08df29b..37314fb59a 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -15,15 +15,17 @@ int main() { // create same layer from convlayer-test and make sure result is the same even // in multi-host environment - galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), - .input_columns = 3, - .output_columns = 2}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + galois::GNNConfig dcon; + dcon.allow_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique( - 0, *(test_graph.get()), dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(0, *(test_graph.get()), + dimension_0, dcon); layer_0->InitAllWeightsTo1(); // backward pass checking; check the gradients out From 866f25c0517936fdad94cc198a04934db61cc5a4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 17 Oct 2020 10:59:30 -0500 Subject: [PATCH 374/660] example --- libgnn/src/SoftmaxLayer.cu | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 libgnn/src/SoftmaxLayer.cu diff --git a/libgnn/src/SoftmaxLayer.cu b/libgnn/src/SoftmaxLayer.cu new file mode 100644 index 0000000000..4a9bce5b26 --- /dev/null +++ b/libgnn/src/SoftmaxLayer.cu @@ -0,0 +1,24 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" // Please add GPU functions +#include "galois/layers/SoftmaxLayer.h" + +// Allocate memory and initialize +void galois::SoftmaxLayer::Init() { +} + +// Input: in_tensor +// Output: out_tensor +void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, + galois::GNNFloat* out_tensor) { +} + +// Input: in_tensor +// Input: out_tensor +// Input: out_gradients +// Output: in_gradients +void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, + const galois::GNNFloat* out_tensor, + galois::GNNFloat* in_gradients, + galois::GNNFloat* out_gradients) { +} + From fdb4aa19d3ffa6a5937d9d8193041a570ca36c7f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 17 Oct 2020 11:06:40 -0500 Subject: [PATCH 375/660] ass Init --- libgnn/include/galois/layers/SoftmaxLayer.h | 3 +++ libgnn/src/SoftmaxLayer.cpp | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 3b5ace94c8..e410337964 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -21,6 +21,7 @@ class SoftmaxLayer : public GNNLayer { GALOIS_LOG_ASSERT(dimensions.input_columns == 
dimensions.output_columns); // output needs to match number of possible classes GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); + Init(); } //! Creates probability distribution of each row of input @@ -48,6 +49,8 @@ class SoftmaxLayer : public GNNLayer { //! derivative calculation; each is the size of a feature vector galois::substrate::PerThreadStorage> softmax_temp_vectors_; + + void Init(); }; } // namespace galois diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/SoftmaxLayer.cpp index 30dc476965..1262555a36 100644 --- a/libgnn/src/SoftmaxLayer.cpp +++ b/libgnn/src/SoftmaxLayer.cpp @@ -2,6 +2,10 @@ #include "galois/GNNMath.h" #include "galois/layers/SoftmaxLayer.h" +// Allocate memory and initialize +void galois::SoftmaxLayer::Init() { +} + const std::vector& galois::SoftmaxLayer::ForwardPhase( const std::vector& input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); From d013d3593db41dd784b108924f9f078e699ab36a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 17 Oct 2020 11:40:55 -0500 Subject: [PATCH 376/660] add comments --- libgnn/src/SoftmaxLayer.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libgnn/src/SoftmaxLayer.cu b/libgnn/src/SoftmaxLayer.cu index 4a9bce5b26..d9ed5fc0ff 100644 --- a/libgnn/src/SoftmaxLayer.cu +++ b/libgnn/src/SoftmaxLayer.cu @@ -16,6 +16,10 @@ void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, // Input: out_tensor // Input: out_gradients // Output: in_gradients +// Note: although out_gradients is an input data, +// it is not const because it can be reused +// to hold intermediate data inside this function, +// to avoid allocating more memory void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, const galois::GNNFloat* out_tensor, galois::GNNFloat* in_gradients, From e4fa27b7640f6f196b0552f92b028e76972a8c1e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 17 Oct 2020 11:05:44 -0500 Subject: [PATCH 377/660] libgnn src directory follows include dir structure Adds subdirectories to the src directory in libgnn to follow the same structure as the include directory. 
--- libgnn/CMakeLists.txt | 12 ++++++------ libgnn/src/{ => graphs}/GNNGraph.cpp | 0 libgnn/src/{ => layers}/GNNLayer.cpp | 0 libgnn/src/{ => layers}/GluonGradientInterface.cpp | 0 libgnn/src/{ => layers}/GraphConvolutionalLayer.cpp | 0 libgnn/src/{ => layers}/SoftmaxLayer.cpp | 0 libgnn/src/{ => layers}/SoftmaxLayer.cu | 0 7 files changed, 6 insertions(+), 6 deletions(-) rename libgnn/src/{ => graphs}/GNNGraph.cpp (100%) rename libgnn/src/{ => layers}/GNNLayer.cpp (100%) rename libgnn/src/{ => layers}/GluonGradientInterface.cpp (100%) rename libgnn/src/{ => layers}/GraphConvolutionalLayer.cpp (100%) rename libgnn/src/{ => layers}/SoftmaxLayer.cpp (100%) rename libgnn/src/{ => layers}/SoftmaxLayer.cu (100%) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 9d1b18b682..d635781ba6 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,12 +1,12 @@ set(sources - src/GNNGraph.cpp - src/GNNLayer.cpp src/GNNMath.cpp - src/GraphConvolutionalLayer.cpp - src/SoftmaxLayer.cpp - src/GraphNeuralNetwork.cpp src/GNNOptimizers.cpp - src/GluonGradientInterface.cpp + src/GraphNeuralNetwork.cpp + src/graphs/GNNGraph.cpp + src/layers/GNNLayer.cpp + src/layers/GluonGradientInterface.cpp + src/layers/GraphConvolutionalLayer.cpp + src/layers/SoftmaxLayer.cpp ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp similarity index 100% rename from libgnn/src/GNNGraph.cpp rename to libgnn/src/graphs/GNNGraph.cpp diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp similarity index 100% rename from libgnn/src/GNNLayer.cpp rename to libgnn/src/layers/GNNLayer.cpp diff --git a/libgnn/src/GluonGradientInterface.cpp b/libgnn/src/layers/GluonGradientInterface.cpp similarity index 100% rename from libgnn/src/GluonGradientInterface.cpp rename to libgnn/src/layers/GluonGradientInterface.cpp diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp similarity index 100% rename from libgnn/src/GraphConvolutionalLayer.cpp rename to libgnn/src/layers/GraphConvolutionalLayer.cpp diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp similarity index 100% rename from libgnn/src/SoftmaxLayer.cpp rename to libgnn/src/layers/SoftmaxLayer.cpp diff --git a/libgnn/src/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu similarity index 100% rename from libgnn/src/SoftmaxLayer.cu rename to libgnn/src/layers/SoftmaxLayer.cu From 67eb0a29400d5d6dd0809788306ca7d12efd2056 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 17 Oct 2020 11:50:42 -0500 Subject: [PATCH 378/660] GNNConfig -> GNNLayerConfig GNNConfig is too general a name when it was only used for layers: changed the name accorindingly. 
--- libgnn/include/galois/GraphNeuralNetwork.h | 8 ++++---- libgnn/include/galois/layers/GNNLayer.h | 8 ++++---- libgnn/include/galois/layers/GraphConvolutionalLayer.h | 5 +++-- libgnn/include/galois/layers/SoftmaxLayer.h | 2 +- libgnn/src/layers/GNNLayer.cpp | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 2 +- libgnn/test/accuracy-test.cpp | 2 +- libgnn/test/aggregate-sync-test.cpp | 2 +- libgnn/test/convlayer-test.cpp | 4 ++-- libgnn/test/epoch-test.cpp | 2 +- libgnn/test/gnnfb-test.cpp | 7 ++++--- libgnn/test/weight-sync-test.cpp | 2 +- 12 files changed, 24 insertions(+), 22 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 1762cda8da..919c11046a 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -30,14 +30,14 @@ class GraphNeuralNetworkConfig { const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type) : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, - output_layer_type, GNNConfig()) {} + output_layer_type, GNNLayerConfig()) {} //! Construction with a specified config for layers GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type, - const GNNConfig& default_layer_config) + const GNNLayerConfig& default_layer_config) : num_intermediate_layers_(num_layers), layer_types_(layer_types), layer_column_sizes_(layer_column_sizes), output_layer_type_(output_layer_type), @@ -71,7 +71,7 @@ class GraphNeuralNetworkConfig { return layer_column_sizes_[num_intermediate_layers_]; } //! Get the default layer config of layers in this GNN - const GNNConfig& default_layer_config() { return default_layer_config_; } + const GNNLayerConfig& default_layer_config() { return default_layer_config_; } private: //! Number of layers to construct in the GNN not including the output @@ -86,7 +86,7 @@ class GraphNeuralNetworkConfig { //! Output layer type GNNOutputLayerType output_layer_type_; //! Default config to use for layers - GNNConfig default_layer_config_; + GNNLayerConfig default_layer_config_; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 2232e82b5c..2473de7229 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -33,7 +33,7 @@ struct GNNLayerDimensions { }; //! Config options for operations that can occur in a layer -struct GNNConfig { +struct GNNLayerConfig { //! True if weights should be allocated bool allocate_weights{true}; //! True if dropout is to be done at beginning of forward phase @@ -61,12 +61,12 @@ class GNNLayer { //! the input/output dimensions of the MxM that occurs in the layer; config //! as well GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - const GNNLayerDimensions& dimensions, const GNNConfig& config); + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); //! Uses a default config GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase @@ -137,7 +137,7 @@ class GNNLayer { //! 
Dimensions (input/output sizes) of this layer GNNLayerDimensions layer_dimensions_; //! Config object for certain parameters for layer - GNNConfig config_; + GNNLayerConfig config_; //! Weights used by this layer. Dimensions: input columns by output columns std::vector layer_weights_; //! Gradients used to update the weights of this layer diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 6a99682b8a..123a8d774a 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -11,12 +11,13 @@ class GraphConvolutionalLayer : public GNNLayer { GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, - const GNNConfig& config); + const GNNLayerConfig& config); GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions) - : GraphConvolutionalLayer(layer_num, graph, dimensions, GNNConfig()) {} + : GraphConvolutionalLayer(layer_num, graph, dimensions, + GNNLayerConfig()) {} // Parent functions const std::vector& diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index e410337964..815f2401ff 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -11,7 +11,7 @@ class SoftmaxLayer : public GNNLayer { SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, - GNNConfig{.allocate_weights = false}), + GNNLayerConfig{.allocate_weights = false}), input_loss_(dimensions.input_rows), ground_truth_vectors_(dimensions.input_columns), norm_gradient_vectors_(dimensions.input_columns), diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 838bf45905..dc81a9ca2b 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -5,7 +5,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, - const GNNConfig& config) + const GNNLayerConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index d02a2bf0ca..57a5d9505b 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -4,7 +4,7 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t layer_num, const galois::graphs::GNNGraph& graph, - const GNNLayerDimensions& dimensions, const GNNConfig& config) + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) : GNNLayer(layer_num, graph, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { diff --git a/libgnn/test/accuracy-test.cpp b/libgnn/test/accuracy-test.cpp index 61d449255f..e1fc17702e 100644 --- a/libgnn/test/accuracy-test.cpp +++ b/libgnn/test/accuracy-test.cpp @@ -21,7 +21,7 @@ int main() { std::vector layer_output_sizes = {7, 7}; galois::GraphNeuralNetworkConfig gnn_config( 1, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig()); + galois::GNNLayerConfig()); std::vector adam_sizes = {21}; auto adam = std::make_unique(adam_sizes, 1); diff --git 
a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index ea184e3e2a..432a546448 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -29,7 +29,7 @@ int main() { dimension_0.input_rows = test_graph->size(); dimension_0.input_columns = 3; dimension_0.output_columns = 2; - galois::GNNConfig l_config; + galois::GNNLayerConfig l_config; l_config.allow_aggregate_after_update = false; // create the layer, no norm factor diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 1d89cf198a..00825cf6f8 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -50,7 +50,7 @@ int main() { dimension_0.input_columns = 3; dimension_0.output_columns = 2; - galois::GNNConfig dcon; + galois::GNNLayerConfig dcon; dcon.allow_aggregate_after_update = false; // create the layer, no norm factor @@ -204,7 +204,7 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - galois::GNNConfig config; + galois::GNNLayerConfig config; config.do_dropout = true; config.do_activation = true; config.do_normalization = true; diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 20c987be60..21d5249fd1 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -21,7 +21,7 @@ int main() { galois::GNNLayerType::kGraphConvolutional}; std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; - galois::GNNConfig layer_config; + galois::GNNLayerConfig layer_config; layer_config.do_dropout = true; layer_config.do_activation = false; layer_config.do_normalization = true; diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 692cbfd30c..d43e1b0e2e 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -23,10 +23,11 @@ int main() { galois::GNNLayerType::kGraphConvolutional}; // note this includes the output; last 2 must be same because softmax std::vector layer_output_sizes = {4, 7, 7}; - galois::GNNConfig dcon; + galois::GNNLayerConfig dcon; dcon.allow_aggregate_after_update = false; - // note GNNConfig is passed in; use a config that does not do anything extra - // like dropout or activation and the like so that input is easier to verify + // note GNNLayerConfig is passed in; use a config that does not do anything + // extra like dropout or activation and the like so that input is easier to + // verify galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, dcon); diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 37314fb59a..561aa95370 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -19,7 +19,7 @@ int main() { dimension_0.input_rows = test_graph->size(); dimension_0.input_columns = 3; dimension_0.output_columns = 2; - galois::GNNConfig dcon; + galois::GNNLayerConfig dcon; dcon.allow_aggregate_after_update = false; // create the layer, no norm factor From 434637650e1ef0cd7f605f5f6fee926752b142f1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 13:09:06 -0500 Subject: [PATCH 379/660] Command line options for GNN apps (libgnnbench) Added a new static library in the lonestar directory called libgnnbench that will be the backend for the (distributed) gnn benchmarks added in the future. At the moment it contains command line declarations for user configurable things in the GNN. 
Also changed the variable name of the variable containing the default path for gnn inputs (preparation for letting it get passed in as a user defined parameter). --- libgnn/include/galois/graphs/GNNGraph.h | 2 +- libgnn/src/graphs/GNNGraph.cpp | 17 +++--- lonestar/CMakeLists.txt | 1 + lonestar/libgnnbench/CMakeLists.txt | 5 ++ lonestar/libgnnbench/include/GNNBench/Input.h | 23 ++++++++ lonestar/libgnnbench/src/Input.cpp | 59 +++++++++++++++++++ 6 files changed, 99 insertions(+), 8 deletions(-) create mode 100644 lonestar/libgnnbench/CMakeLists.txt create mode 100644 lonestar/libgnnbench/include/GNNBench/Input.h create mode 100644 lonestar/libgnnbench/src/Input.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index a0b1430add..2a7e20b445 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -9,7 +9,7 @@ namespace galois { // TODO remove the need to hardcode this path //! Path to location of all gnn files -static const std::string gnn_dataset_path = +static const std::string default_gnn_dataset_path = "/net/ohm/export/iss/inputs/Learning/"; //! Helper struct to maintain start/end/size of any particular range. Mostly diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index a327dfe641..fe57784b30 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -9,7 +9,8 @@ std::unique_ptr LoadPartition(const std::string& dataset_name, galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path - std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = + galois::default_gnn_dataset_path + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); // load partition @@ -118,7 +119,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); std::string filename = - galois::gnn_dataset_path + dataset_name + "-labels.txt"; + galois::default_gnn_dataset_path + dataset_name + "-labels.txt"; // read file header, save num label classes while at it std::ifstream file_stream; file_stream.open(filename, std::ios::in); @@ -190,7 +191,8 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in dimensions of features, specifically node feature length size_t num_global_vertices; - std::string file_dims = galois::gnn_dataset_path + dataset_name + "-dims.txt"; + std::string file_dims = + galois::default_gnn_dataset_path + dataset_name + "-dims.txt"; std::ifstream ifs; ifs.open(file_dims, std::ios::in); ifs >> num_global_vertices >> node_feature_length_; @@ -209,7 +211,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in all features std::ifstream file_stream; std::string feature_file = - galois::gnn_dataset_path + dataset_name + "-feats.bin"; + galois::default_gnn_dataset_path + dataset_name + "-feats.bin"; file_stream.open(feature_file, std::ios::binary | std::ios::in); file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * num_global_vertices * @@ -245,8 +247,8 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( size_t range_end; // read mask range - std::string mask_filename = - galois::gnn_dataset_path + dataset_name + "-" + mask_type + "_mask.txt"; + std::string mask_filename = galois::default_gnn_dataset_path + dataset_name + + "-" + mask_type + "_mask.txt"; std::ifstream mask_stream; 
mask_stream.open(mask_filename, std::ios::in); mask_stream >> range_begin >> range_end >> std::ws; @@ -349,7 +351,8 @@ void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { } void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { - std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = + galois::default_gnn_dataset_path + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, input_file); galois::graphs::readGraph(whole_graph_, input_file); diff --git a/lonestar/CMakeLists.txt b/lonestar/CMakeLists.txt index e00c61eb89..a0efe7bae7 100644 --- a/lonestar/CMakeLists.txt +++ b/lonestar/CMakeLists.txt @@ -226,4 +226,5 @@ add_subdirectory(scientific) if(USE_DEEPGALOIS) add_subdirectory(gnn) + add_subdirectory(libgnnbench) endif(USE_DEEPGALOIS) diff --git a/lonestar/libgnnbench/CMakeLists.txt b/lonestar/libgnnbench/CMakeLists.txt new file mode 100644 index 0000000000..31d174d581 --- /dev/null +++ b/lonestar/libgnnbench/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(gnnbench STATIC src/Input.cpp) +target_include_directories(gnnbench PUBLIC + "${CMAKE_CURRENT_SOURCE_DIR}/include" +) +target_link_libraries(gnnbench galois_gnn LLVMSupport) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h new file mode 100644 index 0000000000..e9885026df --- /dev/null +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -0,0 +1,23 @@ +#pragma once + +#include "galois/graphs/GNNGraph.h" +#include + +//! Directory where all files used for GNN training are found +extern llvm::cl::opt input_directory; +//! Base graph name (used to find the csgr, features, masks, etc.) +extern llvm::cl::opt input_file; +//! Scheme used to partition the graph +extern llvm::cl::opt partition_scheme; +// Control layer count and size +extern llvm::cl::opt num_layers; +extern llvm::cl::list layer_sizes; +// Control dropout +extern llvm::cl::opt do_dropout; +extern llvm::cl::opt dropout_rate; +// Control activation +extern llvm::cl::opt do_activation; +// TODO activation layer type once more are supported +//! 
Controls weight normalization based on degree +extern llvm::cl::opt do_normalization; +// TODO output layer type diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp new file mode 100644 index 0000000000..fe167e24b6 --- /dev/null +++ b/lonestar/libgnnbench/src/Input.cpp @@ -0,0 +1,59 @@ +#include "GNNBench/Input.h" + +namespace cll = llvm::cl; + +// Self documented via the desc argument + +llvm::cl::opt input_directory( + "inputDirectory", + cll::desc("Base directory to find all files required for doing GNN " + "training (features, graph topology, masks, etc.)"), + cll::init(galois::default_gnn_dataset_path)); + +llvm::cl::opt input_file( + cll::Positional, + cll::desc("Base name of graph: used to find csgr, features, etc."), + cll::Required); + +llvm::cl::opt partition_scheme( + "partition", cll::desc("Type of partitioning."), + cll::values(clEnumValN(galois::graphs::GNNPartitionScheme::kOEC, "oec", + "Outgoing Edge-Cut (default)"), + clEnumValN(galois::graphs::GNNPartitionScheme::kCVC, "cvc", + "Cartesian Vertex-Cut")), + cll::init(galois::graphs::GNNPartitionScheme::kOEC)); + +llvm::cl::opt num_layers( + "numLayers", + cll::desc( + "Number of intermediate layers in the neural network (default 2))"), + cll::init(2)); + +llvm::cl::list layer_sizes( + "layerSizes", + cll::desc( + "Comma separated list of numbers specifying intermediate layer sizes"), + cll::CommaSeparated); + +llvm::cl::opt do_dropout( + "doDropout", + cll::desc("If true (on by default), does dropout of input during training"), + cll::init(true)); + +llvm::cl::opt dropout_rate( + "dropoutRate", + cll::desc("Specifies probability that any one weight is DROPPED (e.g., if " + "0.1, then 10 percent chance of dropping) (default 0.5)"), + cll::init(0.5)); + +llvm::cl::opt + do_activation("doActivation", + cll::desc("If true (off by default), does activation at the " + "end of an intermediate layer"), + cll::init(false)); + +llvm::cl::opt + do_normalization("doNormalization", + cll::desc("If true (on by default), normalizes vertex " + "features based on their degree"), + cll::init(true)); From e891594d6a1f0ca8adeb9614620ba9e2a07c6658 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 14:20:03 -0500 Subject: [PATCH 380/660] Disabled gDebug huge page message in PageAlloc Makes debug mode prints way more than necessary --- libgalois/src/PageAlloc.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/libgalois/src/PageAlloc.cpp b/libgalois/src/PageAlloc.cpp index e051a6431d..a45e72e93d 100644 --- a/libgalois/src/PageAlloc.cpp +++ b/libgalois/src/PageAlloc.cpp @@ -60,7 +60,6 @@ void* galois::substrate::allocPages(unsigned num, bool preFault) { void* ptr = trymmap(num * hugePageSize, preFault ? _MAP_HUGE_POP : _MAP_HUGE); if (!ptr) { - gDebug("Huge page alloc failed, falling back"); ptr = trymmap(num * hugePageSize, preFault ? _MAP_POP : _MAP); } From cb97c91519192213977100badba42add2fe11b42 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 14:33:34 -0500 Subject: [PATCH 381/660] Start code for GNNBench, new dist gcn app Adds the commandline parsing/stats setup code to GNNBench (based on DistBench). Adds a new gcn-dist app under lonestart/gnn/distributed that will be the main executable for gcn from now on. Successfully compiles using the GNNBench code. Next step is the GNN construction code in the lonestar end of the code. 
--- lonestar/gnn/CMakeLists.txt | 3 +- lonestar/gnn/distributed/CMakeLists.txt | 1 + lonestar/gnn/distributed/gcn/CMakeLists.txt | 3 + lonestar/gnn/distributed/gcn/gcn-dist.cpp | 10 +++ lonestar/libgnnbench/CMakeLists.txt | 2 +- lonestar/libgnnbench/include/GNNBench/Input.h | 9 ++- lonestar/libgnnbench/include/GNNBench/Start.h | 22 ++++++ lonestar/libgnnbench/src/Input.cpp | 14 +++- lonestar/libgnnbench/src/Start.cpp | 77 +++++++++++++++++++ 9 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 lonestar/gnn/distributed/CMakeLists.txt create mode 100644 lonestar/gnn/distributed/gcn/CMakeLists.txt create mode 100644 lonestar/gnn/distributed/gcn/gcn-dist.cpp create mode 100644 lonestar/libgnnbench/include/GNNBench/Start.h create mode 100644 lonestar/libgnnbench/src/Start.cpp diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index f718db4942..d07810f48e 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -25,4 +25,5 @@ endif() add_subdirectory(gcn) #add_subdirectory(sage) -add_subdirectory(gat) +#add_subdirectory(gat) +add_subdirectory(distributed) diff --git a/lonestar/gnn/distributed/CMakeLists.txt b/lonestar/gnn/distributed/CMakeLists.txt new file mode 100644 index 0000000000..7863ee29db --- /dev/null +++ b/lonestar/gnn/distributed/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(gcn) diff --git a/lonestar/gnn/distributed/gcn/CMakeLists.txt b/lonestar/gnn/distributed/gcn/CMakeLists.txt new file mode 100644 index 0000000000..c8c9d10447 --- /dev/null +++ b/lonestar/gnn/distributed/gcn/CMakeLists.txt @@ -0,0 +1,3 @@ +# link libgnn library and all should go well +add_executable(gcn-dist gcn-dist.cpp) +target_link_libraries(gcn-dist galois_gnn gnnbench) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp new file mode 100644 index 0000000000..031ae06d13 --- /dev/null +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -0,0 +1,10 @@ +#include "GNNBench/Start.h" +#include "galois/GraphNeuralNetwork.h" + +constexpr static const char* const name = "Graph Convolutional Network"; + +int main(int argc, char* argv[]) { + galois::DistMemSys G; + GNNBenchStart(argc, argv, name); + return 0; +} diff --git a/lonestar/libgnnbench/CMakeLists.txt b/lonestar/libgnnbench/CMakeLists.txt index 31d174d581..14d152c8e7 100644 --- a/lonestar/libgnnbench/CMakeLists.txt +++ b/lonestar/libgnnbench/CMakeLists.txt @@ -1,4 +1,4 @@ -add_library(gnnbench STATIC src/Input.cpp) +add_library(gnnbench STATIC src/Input.cpp src/Start.cpp) target_include_directories(gnnbench PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" ) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index e9885026df..1bb2afdf70 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -1,12 +1,14 @@ #pragma once +#include "galois/GraphNeuralNetwork.h" +#include "galois/Logging.h" #include "galois/graphs/GNNGraph.h" #include //! Directory where all files used for GNN training are found extern llvm::cl::opt input_directory; //! Base graph name (used to find the csgr, features, masks, etc.) -extern llvm::cl::opt input_file; +extern llvm::cl::opt input_name; //! Scheme used to partition the graph extern llvm::cl::opt partition_scheme; // Control layer count and size @@ -21,3 +23,8 @@ extern llvm::cl::opt do_activation; //! 
Controls weight normalization based on degree extern llvm::cl::opt do_normalization; // TODO output layer type + +const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); + +//! Using command line args above, create a GNN. +// XXX diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h new file mode 100644 index 0000000000..93fc3ee0b1 --- /dev/null +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -0,0 +1,22 @@ +#pragma once + +#include "galois/Galois.h" +#include "galois/Version.h" +#include "GNNBench/Input.h" + +//////////////////////////////////////////////////////////////////////////////// +// CLI +//////////////////////////////////////////////////////////////////////////////// + +extern llvm::cl::opt num_threads; +extern llvm::cl::opt num_runs; +extern llvm::cl::opt stat_file; + +//////////////////////////////////////////////////////////////////////////////// +// Init functions +//////////////////////////////////////////////////////////////////////////////// + +//! Parses command line + setup some stats +void GNNBenchStart(int argc, char** argv, const char* app); +void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, + const char* url); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index fe167e24b6..0965234b51 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -10,7 +10,7 @@ llvm::cl::opt input_directory( "training (features, graph topology, masks, etc.)"), cll::init(galois::default_gnn_dataset_path)); -llvm::cl::opt input_file( +llvm::cl::opt input_name( cll::Positional, cll::desc("Base name of graph: used to find csgr, features, etc."), cll::Required); @@ -57,3 +57,15 @@ llvm::cl::opt cll::desc("If true (on by default), normalizes vertex " "features based on their degree"), cll::init(true)); + +const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { + switch (s) { + case galois::graphs::GNNPartitionScheme::kOEC: + return "oec"; + case galois::graphs::GNNPartitionScheme::kCVC: + return "cvc"; + default: + GALOIS_LOG_FATAL("Invalid partitioning scheme"); + return ""; + } +} diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp new file mode 100644 index 0000000000..6276b373ee --- /dev/null +++ b/lonestar/libgnnbench/src/Start.cpp @@ -0,0 +1,77 @@ +#include "GNNBench/Start.h" + +namespace cll = llvm::cl; + +cll::opt num_threads("t", cll::desc("Number of threads (default 1)"), + cll::init(1)); +cll::opt num_runs("runs", cll::desc("Number of runs (default 1)"), + cll::init(1)); +cll::opt + stat_file("statFile", cll::desc("Optional output file to print stats to")); + +//////////////////////////////////////////////////////////////////////////////// + +static void PrintVersion(llvm::raw_ostream& out) { + out << "D-Galois Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; + out.flush(); +} + +//////////////////////////////////////////////////////////////////////////////// + +void GNNBenchStart(int argc, char** argv, const char* app) { + GNNBenchStart(argc, argv, app, nullptr, nullptr); +} + +void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, + const char* url) { + llvm::cl::SetVersionPrinter(PrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + num_threads = galois::setActiveThreads(num_threads); + galois::runtime::setStatFile(stat_file); + + auto& net = galois::runtime::getSystemNetworkInterface(); 
+ + if (net.ID == 0) { + PrintVersion(llvm::outs()); + llvm::outs() << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + llvm::outs() << "http://iss.ices.utexas.edu/galois/\n\n"; + llvm::outs() << "application: " << (app ? app : "unspecified") << "\n"; + + if (desc) { + llvm::outs() << desc << "\n"; + } + if (url) { + llvm::outs() + << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url + << "\n"; + } + llvm::outs() << "\n"; + llvm::outs().flush(); + + std::ostringstream cmdout; + + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + + galois::runtime::reportParam("GNNBench", "CommandLine", cmdout.str()); + galois::runtime::reportParam("GNNBench", "Threads", num_threads); + galois::runtime::reportParam("GNNBench", "Hosts", net.Num); + galois::runtime::reportParam("GNNBench", "Runs", num_runs); + galois::runtime::reportParam("GNNBench", "Run_UUID", + galois::runtime::getRandUUID()); + galois::runtime::reportParam("GNNBench", "InputDirectory", input_directory); + galois::runtime::reportParam("GNNBench", "Input", input_name); + galois::runtime::reportParam("GNNBench", "PartitionScheme", + GNNPartitionToString(partition_scheme)); + // XXX report the rest of the command line options + } + + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("GNNBench", "Hostname", name); +} From 0f6cc119a5eab2ff57a1370db8e4fb2608090972 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:04:09 -0500 Subject: [PATCH 382/660] GNNGraph takes an input directory argument Adds an input directory argument to the constructor of a GNNGraph in order to allow a caller to use an input directory that differs from the default hard-coded directory. Done in prep for the new GCN app being able to specify whatever input directory it wants. --- libgnn/include/galois/graphs/GNNGraph.h | 6 ++++- libgnn/src/graphs/GNNGraph.cpp | 34 +++++++++++++++---------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2a7e20b445..8ce85092ac 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -35,9 +35,11 @@ class GNNGraph { using NodeIterator = boost::counting_iterator; using EdgeIterator = GNNDistGraph::edge_iterator; - //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); + //! Loads a graph and all relevant metadata (labels, features, masks, etc.) + GNNGraph(const std::string& input_directory, const std::string& dataset_name, + GNNPartitionScheme partition_scheme, bool has_single_class_label); //! Returns host id size_t host_id() const { return host_id_; } @@ -118,6 +120,8 @@ class GNNGraph { const size_t matrix_column_size) const; private: + //! Directory for input data + const std::string input_directory_; //! In a multi-host setting, this variable stores the host id that the graph //! is currently running on unsigned host_id_; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fe57784b30..b77e5df0a8 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -6,11 +6,11 @@ namespace { //! 
Partitions a particular dataset given some partitioning scheme std::unique_ptr -LoadPartition(const std::string& dataset_name, +LoadPartition(const std::string& input_directory, + const std::string& dataset_name, galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path - std::string input_file = - galois::default_gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = input_directory + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); // load partition @@ -39,6 +39,15 @@ size_t gnn_matrix_to_sync_column_length_ = 0; galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label) { + GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, + has_single_class_label); +} + +galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, + const std::string& dataset_name, + GNNPartitionScheme partition_scheme, + bool has_single_class_label) + : input_directory_(input_directory) { GALOIS_LOG_VERBOSE("[{}] Constructing partitiong for {}", host_id_, dataset_name); // save host id @@ -48,7 +57,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, std::to_string(galois::runtime::getSystemNetworkInterface().ID) + std::string("] "); // load partition - partitioned_graph_ = LoadPartition(dataset_name, partition_scheme); + partitioned_graph_ = + LoadPartition(input_directory_, dataset_name, partition_scheme); // read additional graph data ReadLocalLabels(dataset_name, has_single_class_label); @@ -118,8 +128,7 @@ void galois::graphs::GNNGraph::AggregateSync( void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - std::string filename = - galois::default_gnn_dataset_path + dataset_name + "-labels.txt"; + std::string filename = input_directory_ + dataset_name + "-labels.txt"; // read file header, save num label classes while at it std::ifstream file_stream; file_stream.open(filename, std::ios::in); @@ -191,8 +200,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in dimensions of features, specifically node feature length size_t num_global_vertices; - std::string file_dims = - galois::default_gnn_dataset_path + dataset_name + "-dims.txt"; + std::string file_dims = input_directory_ + dataset_name + "-dims.txt"; std::ifstream ifs; ifs.open(file_dims, std::ios::in); ifs >> num_global_vertices >> node_feature_length_; @@ -210,8 +218,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in all features std::ifstream file_stream; - std::string feature_file = - galois::default_gnn_dataset_path + dataset_name + "-feats.bin"; + std::string feature_file = input_directory_ + dataset_name + "-feats.bin"; file_stream.open(feature_file, std::ios::binary | std::ios::in); file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * num_global_vertices * @@ -247,8 +254,8 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( size_t range_end; // read mask range - std::string mask_filename = galois::default_gnn_dataset_path + dataset_name + - "-" + mask_type + "_mask.txt"; + std::string mask_filename = + input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; std::ifstream mask_stream; mask_stream.open(mask_filename, std::ios::in); mask_stream >> range_begin >> range_end >> std::ws; @@ -351,8 +358,7 @@ void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { } void 
galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { - std::string input_file = - galois::default_gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = input_directory_ + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, input_file); galois::graphs::readGraph(whole_graph_, input_file); From fa8a3c543c3ab237f5c2f733293319a9ac548932 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:05:57 -0500 Subject: [PATCH 383/660] Adds GNN user-side construction func to GNNBench GNNBench now provides a GNN construction function that uses the user specified command line options to create a GCN for use in an app. --- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 18 ++- lonestar/libgnnbench/include/GNNBench/Input.h | 11 +- lonestar/libgnnbench/src/Input.cpp | 126 +++++++++++++++++- 3 files changed, 146 insertions(+), 9 deletions(-) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index 031ae06d13..b2c1888c7a 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -1,10 +1,26 @@ #include "GNNBench/Start.h" -#include "galois/GraphNeuralNetwork.h" constexpr static const char* const name = "Graph Convolutional Network"; int main(int argc, char* argv[]) { galois::DistMemSys G; GNNBenchStart(argc, argv, name); + + galois::StatTimer init_timer("InitializationTime"); + init_timer.start(); + std::unique_ptr gnn = + InitializeGraphNeuralNetwork(galois::GNNLayerType::kGraphConvolutional); + gnn->SetLayerPhases(galois::GNNPhase::kTrain); + init_timer.stop(); + + galois::StatTimer compute_timer("Timer_0"); + compute_timer.start(); + + galois::StatTimer train_timer("TrainingTime"); + train_timer.start(); + + train_timer.stop(); + compute_timer.stop(); + return 0; } diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 1bb2afdf70..fc5059bb0c 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -1,7 +1,6 @@ #pragma once #include "galois/GraphNeuralNetwork.h" -#include "galois/Logging.h" #include "galois/graphs/GNNGraph.h" #include @@ -23,8 +22,14 @@ extern llvm::cl::opt do_activation; //! Controls weight normalization based on degree extern llvm::cl::opt do_normalization; // TODO output layer type +// TODO optimizer type +//! Toggles an optimization that flips aggregate/update step if it would be +//! beneficial +extern llvm::cl::opt agg_after_update; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); -//! Using command line args above, create a GNN. -// XXX +//! Using command line args above, create a GNN using some specified layer type +//! as the intermediate layer. 
+std::unique_ptr +InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 0965234b51..97ef7a6fc3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -1,3 +1,4 @@ +#include "galois/Logging.h" #include "GNNBench/Input.h" namespace cll = llvm::cl; @@ -29,11 +30,11 @@ llvm::cl::opt num_layers( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list layer_sizes( - "layerSizes", - cll::desc( - "Comma separated list of numbers specifying intermediate layer sizes"), - cll::CommaSeparated); +llvm::cl::list + layer_sizes("layerSizes", + cll::desc("Comma separated list of numbers specifying " + "intermediate layer sizes (does not include output)"), + cll::CommaSeparated); llvm::cl::opt do_dropout( "doDropout", @@ -58,6 +59,12 @@ llvm::cl::opt "features based on their degree"), cll::init(true)); +llvm::cl::opt + agg_after_update("allowAggregationAfterUpdate", + cll::desc("If true (on by default), allows aggregate to " + "be done after update as an optimization"), + cll::init(true)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -69,3 +76,112 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { return ""; } } + +//! Initializes the vector of layer sizes from command line args + graph +std::vector +CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { + // set layer sizes for intermdiate and output layers + std::vector layer_sizes_vector; + if (layer_sizes.size()) { + GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + for (size_t i = 0; i < num_layers; i++) { + layer_sizes_vector.emplace_back(layer_sizes[i]); + } + // verify user satisfies last intermediate layer needing to have same size + // as # label classes + GALOIS_LOG_ASSERT(layer_sizes_vector.back() == + gnn_graph->GetNumLabelClasses()); + } else { + // default 16 for everything until last 2 + for (size_t i = 0; i < num_layers - 1; i++) { + layer_sizes_vector.emplace_back(16); + } + // last 2 sizes must be equivalent to # label classes; this is the last + // intermediate layer + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + } + + // TODO + // for now only softmax layer which dictates the output size of the last + // intermediate layer + size of the output layer + // output layer at the moment required to be same as # label classes + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + + return layer_sizes_vector; +} + +//! Setup layer config struct based on cli args +galois::GNNLayerConfig CreateLayerConfig() { + galois::GNNLayerConfig layer_config; + layer_config.do_dropout = do_dropout; + layer_config.dropout_rate = dropout_rate; + layer_config.do_activation = do_activation; + layer_config.do_normalization = do_normalization; + layer_config.allow_aggregate_after_update = agg_after_update; + return layer_config; +} + +std::unique_ptr +CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { + std::vector opt_sizes; + + // optimizer sizes are based on intermediate layer sizes, input feats, and + // # label classes + if (layer_sizes.size()) { + GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); + // assumption here is that if it reached this point then layer sizes were + // already sanity checked previously (esp. 
last layer) + for (size_t i = 1; i < num_layers; i++) { + opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); + } + } else { + // everything is size 16 until last + if (num_layers == 1) { + // single layer requires a bit of special handling + opt_sizes.emplace_back(gnn_graph->node_feature_length() * + gnn_graph->GetNumLabelClasses()); + } else { + // first + opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); + for (size_t i = 1; i < num_layers - 1; i++) { + opt_sizes.emplace_back(16 * 16); + } + // last + opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + } + } + GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); + + // TODO only adam works right now, add the others later + return std::make_unique(opt_sizes, num_layers); +} + +std::unique_ptr +InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { + // partition/load graph + auto gnn_graph = std::make_unique( + input_directory, input_name, partition_scheme, true); + + // create layer types vector + std::vector layer_types; + for (size_t i = 0; i < num_layers; i++) { + layer_types.push_back(layer_type); + } + // sizes + std::vector layer_sizes_vector = + CreateLayerSizesVector(gnn_graph.get()); + // layer config object + galois::GNNLayerConfig layer_config = CreateLayerConfig(); + // GNN config object + // TODO output type should be configurable + galois::GraphNeuralNetworkConfig gnn_config( + num_layers, layer_types, layer_sizes_vector, + galois::GNNOutputLayerType::kSoftmax, layer_config); + // optimizer + std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); + + // create the gnn + return std::make_unique( + std::move(gnn_graph), std::move(opt), std::move(gnn_config)); +} From e81eb03a143f0d8a2555a1d4625b6a9ef68f509d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:47:05 -0500 Subject: [PATCH 384/660] Train function to GNN Added a high level train function for callers to do end to end training for some specified number of epochs for the graph neural network class. --- libgnn/include/galois/GraphNeuralNetwork.h | 5 ++++ libgnn/src/GraphNeuralNetwork.cpp | 29 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 919c11046a..725e3a69d1 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -13,6 +13,7 @@ namespace galois { //////////////////////////////////////////////////////////////////////////////// +// TODO validation and testing intervals //! Configuration object passed into constructor of a GraphNeuralNetwork to //! determine how the network gets constructed. class GraphNeuralNetworkConfig { @@ -131,6 +132,10 @@ class GraphNeuralNetwork { //! Returns the output layer const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + //! Do training for a specified # of epochs and return test accuracy at the + //! end of it + float Train(size_t num_epochs); + //! Propogates the graph's feature vectors through the network to get a new //! vector representation. //! 
Also known as the forward phase in most literature diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 3424c2b3e3..82c1d40c07 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -59,6 +59,35 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } } +float galois::GraphNeuralNetwork::Train(size_t num_epochs) { + const size_t this_host = graph_->host_id(); + // TODO incorporate validation/test intervals + for (size_t epoch = 0; epoch < num_epochs; epoch++) { + const std::vector* predictions = DoInference(); + GradientPropagation(); + float train_accuracy = GetGlobalAccuracy(*predictions); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Train accuracy is ", train_accuracy, + "\n"); + } + // TODO validation and test as necessary + } + + // check test accuracy + galois::StatTimer acc_timer("FinalAccuracyTest"); + acc_timer.start(); + SetLayerPhases(galois::GNNPhase::kTest); + const std::vector* predictions = DoInference(); + float global_accuracy = GetGlobalAccuracy(*predictions); + acc_timer.stop(); + + if (this_host == 0) { + galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); + } + + return global_accuracy; +} + const std::vector* galois::GraphNeuralNetwork::DoInference() { // start with graph features and pass it through all layers of the network const std::vector* layer_input = &(graph_->GetLocalFeatures()); From c7e9c02a47fc6a522a4121bbfca3e6d1a03caa89 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:48:08 -0500 Subject: [PATCH 385/660] Epoch CLI to GNNBench + gcn-dist app complete Adds another command line option to GNNBench to specify the number of epochs to train for. The initial gcn app is also now done and will do training end to end. 
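Editor's note: since the gcn-dist main has been built up piecemeal across patches 381, 383, and this one, here is the whole thing in one place for orientation. It is reconstructed from those diffs rather than copied from the source file, so treat it as a close sketch; the diff that follows only adds the Train() call and the new epochs flag.

#include "GNNBench/Start.h"

constexpr static const char* const name = "Graph Convolutional Network";

int main(int argc, char* argv[]) {
  galois::DistMemSys G;
  // parse GNNBench command line flags, set thread count, report run stats
  GNNBenchStart(argc, argv, name);

  galois::StatTimer init_timer("InitializationTime");
  init_timer.start();
  // build the GCN from the libgnnbench options (partition, layer sizes, ...)
  auto gnn =
      InitializeGraphNeuralNetwork(galois::GNNLayerType::kGraphConvolutional);
  gnn->SetLayerPhases(galois::GNNPhase::kTrain);
  init_timer.stop();

  galois::StatTimer compute_timer("Timer_0");
  compute_timer.start();
  galois::StatTimer train_timer("TrainingTime");
  train_timer.start();
  // num_epochs is the new flag declared in GNNBench/Start.h below
  gnn->Train(num_epochs);
  train_timer.stop();
  compute_timer.stop();

  return 0;
}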
--- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 2 +- lonestar/libgnnbench/include/GNNBench/Start.h | 1 + lonestar/libgnnbench/src/Start.cpp | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index b2c1888c7a..a7eb0a4bae 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]) { galois::StatTimer train_timer("TrainingTime"); train_timer.start(); - + gnn->Train(num_epochs); train_timer.stop(); compute_timer.stop(); diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index 93fc3ee0b1..c17ddecadc 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -10,6 +10,7 @@ extern llvm::cl::opt num_threads; extern llvm::cl::opt num_runs; +extern llvm::cl::opt num_epochs; extern llvm::cl::opt stat_file; //////////////////////////////////////////////////////////////////////////////// diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index 6276b373ee..1a178c583d 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -6,6 +6,10 @@ cll::opt num_threads("t", cll::desc("Number of threads (default 1)"), cll::init(1)); cll::opt num_runs("runs", cll::desc("Number of runs (default 1)"), cll::init(1)); +cll::opt num_epochs("epochs", + cll::desc("Number of epochs (default 50)"), + cll::init(50)); + cll::opt stat_file("statFile", cll::desc("Optional output file to print stats to")); From f69569e26c7c0066791820cbc95c3ffe5b4a2136 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 27 Oct 2020 17:12:07 -0500 Subject: [PATCH 386/660] Disable GALOIS_SUPPORT_ASYNC in libdist The SUPPORT_ASYNC flag makes messages use issend instead of isend. issend was causing issues with distributed execution where under certain interleavings of send/recv a send would get presumably corrupted. For now, disable issend until a fix is found (or maybe remove it forever depending on if it's really necessary or not). Performance wise it does not seem to show much of an impact on GNN apps; mostly noise and large variance has been observed. --- libdist/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdist/CMakeLists.txt b/libdist/CMakeLists.txt index 138a4edabd..57e6aa1750 100644 --- a/libdist/CMakeLists.txt +++ b/libdist/CMakeLists.txt @@ -21,7 +21,7 @@ target_include_directories(galois_dist_async PUBLIC target_link_libraries(galois_dist_async PUBLIC MPI::MPI_CXX) target_link_libraries(galois_dist_async PUBLIC galois_shmem) -target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) +#target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) if (GALOIS_USE_BARE_MPI) target_compile_definitions(galois_dist_async PRIVATE GALOIS_USE_BARE_MPI=1) From 4035e46aeb5fa85c994d37a477a494d79c782f7f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 27 Oct 2020 17:26:34 -0500 Subject: [PATCH 387/660] GNNGraph loading: turn off unnecessary CuSP features Add args to get rid of async partitioning, multi-round partitioning. 
--- libgnn/src/graphs/GNNGraph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b77e5df0a8..3b0c79a628 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -17,10 +17,10 @@ LoadPartition(const std::string& input_directory, switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; From b8c6a90a926101a29278bd1b538532ce788efb10 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 27 Oct 2020 17:28:31 -0500 Subject: [PATCH 388/660] Various Katana changes to Serialize/PODArray Take some changes made to clean up the serialize/POD code from the Katana repo and add them here. --- libdist/include/galois/runtime/Serialize.h | 10 +++++++--- libgalois/CMakeLists.txt | 1 + libgalois/include/galois/PODResizeableArray.h | 12 +++++++++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 1060721ed2..94517e34ca 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -79,13 +79,17 @@ class SerializeBuffer { //! Insert characters from a buffer into the serialize buffer void insert(const uint8_t* c, size_t bytes) { - bufdata.insert(bufdata.end(), c, c + bytes); + if (bytes > 0) { + bufdata.insert(bufdata.end(), c, c + bytes); + } } //! Insert characters from a buffer into the serialize buffer at a particular //! 
offset void insertAt(const uint8_t* c, size_t bytes, size_t offset) { - std::copy_n(c, bytes, bufdata.begin() + offset); + if (bytes > 0) { + std::copy_n(c, bytes, bufdata.begin() + offset); + } } /** @@ -237,7 +241,7 @@ class DeSerializeBuffer { */ void extract(uint8_t* dst, size_t num) { if (num > 0) { - memcpy(dst, &bufdata[offset], num); + std::copy_n(&bufdata[offset], num, dst); offset += num; } } diff --git a/libgalois/CMakeLists.txt b/libgalois/CMakeLists.txt index 8e9d56d48e..76161160f6 100644 --- a/libgalois/CMakeLists.txt +++ b/libgalois/CMakeLists.txt @@ -86,6 +86,7 @@ endif() target_link_libraries(galois_shmem INTERFACE pygalois) target_link_libraries(galois_shmem PRIVATE Threads::Threads) +target_link_libraries(galois_shmem PUBLIC galois_support) if (CMAKE_HAVE_PTHREAD_H) target_compile_definitions(galois_shmem PRIVATE GALOIS_HAVE_PTHREAD) diff --git a/libgalois/include/galois/PODResizeableArray.h b/libgalois/include/galois/PODResizeableArray.h index a37a0b598c..dc1cabdb48 100644 --- a/libgalois/include/galois/PODResizeableArray.h +++ b/libgalois/include/galois/PODResizeableArray.h @@ -28,6 +28,7 @@ #include #include "galois/config.h" +#include "galois/Logging.h" namespace galois { @@ -136,6 +137,9 @@ class PODResizeableArray { } data_ = static_cast<_Tp*>( realloc(reinterpret_cast(data_), capacity_ * sizeof(_Tp))); + if (!data_) { + GALOIS_LOG_FATAL("Out of memory for a PODResizableArray"); + } } } @@ -183,10 +187,12 @@ class PODResizeableArray { void insert(iterator GALOIS_USED_ONLY_IN_DEBUG(position), InputIterator first, InputIterator last) { assert(position == end()); - size_t old_size = size_; size_t to_add = last - first; - resize(old_size + to_add); - std::copy_n(first, to_add, begin() + old_size); + if (to_add > 0) { + size_t old_size = size_; + resize(old_size + to_add); + std::copy_n(first, to_add, begin() + old_size); + } } void swap(PODResizeableArray& v) { From 65a3bb32817886a83cd824a1d3d696d52d8385ed Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Oct 2020 19:19:20 -0500 Subject: [PATCH 389/660] Disable DIE for partitioning of non-gnn datasets Adding a DIE in else clause for the check of break points for GNN datasets unintentionally broke all datasets for other apps. TODO: add a check to only call the function if a GNN dataset is used (i.e. add a flag or add a hardcoded list) --- libcusp/include/galois/graphs/NewGeneric.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 33a618c62f..f4837ff1de 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -95,8 +95,9 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(0); bps.push_back(5); } else { - GALOIS_DIE("invalid input for gnn partitioning ", filename, - " hardcode needed"); + // XXX only die under certain conditions + //GALOIS_DIE("invalid input for gnn partitioning ", filename, + // " hardcode needed"); } // TODO hardcode the rest From 64dfd096e98a6024d491e3c4f5749b66d6e99200 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Oct 2020 19:23:56 -0500 Subject: [PATCH 390/660] GNNGraph cleanup; prep for GPU additions Remove some unused vars/functions originally added to use MKL functions that were eventually dropped. Rearranged some functions and added placeholders for the GPU graph build. 
--- libgnn/include/galois/graphs/GNNGraph.h | 112 +++++++++--------------- libgnn/src/graphs/GNNGraph.cpp | 19 ---- 2 files changed, 41 insertions(+), 90 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 8ce85092ac..3f73aff510 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -5,6 +5,10 @@ #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/graphs/GNNGraph.cuh" +#endif + namespace galois { // TODO remove the need to hardcode this path @@ -47,6 +51,16 @@ class GNNGraph { //! Returns host id in brackets to use for printing things const std::string& host_prefix() const { return host_prefix_; } + //! Length of a node feature + size_t node_feature_length() const { return node_feature_length_; } + + //! Return the number of label classes (i.e. number of possible outputs) + size_t GetNumLabelClasses() const { return num_label_classes_; }; + + ////////////////////////////////////////////////////////////////////////////// + // Graph accessors + ////////////////////////////////////////////////////////////////////////////// + //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } @@ -67,22 +81,7 @@ class GNNGraph { return partitioned_graph_->masterNodesRange().end(); } - //! Given an LID and the current phase of GNN computation, determine if the - //! lid in question is valid for the current phase (i.e., it is part of - //! a training, validation, or test phase mask) - bool IsValidForPhase(const unsigned lid, - const galois::GNNPhase current_phase) const; - //! Returns the label of some local id assuming labels are single class - //! labels. - GNNFloat GetSingleClassLabel(const unsigned lid) const { - assert(using_single_class_labels_); - return local_ground_truth_labels_[lid]; - } - - //! Return the number of label classes - size_t GetNumLabelClasses() const { return num_label_classes_; }; - - // All following functions take a local id + // All following functions take a local node id EdgeIterator EdgeBegin(GraphNode n) const { return partitioned_graph_->edge_begin(n); }; @@ -94,22 +93,25 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } - size_t node_feature_length() const { return node_feature_length_; } + //! Returns the ground truth label of some local id assuming labels are single + //! class labels. + GNNFloat GetSingleClassLabel(const unsigned lid) const { + assert(using_single_class_labels_); + return local_ground_truth_labels_[lid]; + } + //! Return matrix of the local node features const std::vector& GetLocalFeatures() const { return local_node_features_; } - //! Returns a pointer to the CSR indices where the first element starts at - //! 0 (used with MKL) - const uint32_t* GetZeroBasedRowPointer() const { - return zero_start_graph_indices_.data(); - } + //! Given an LID and the current phase of GNN computation, determine if the + //! lid in question is valid for the current phase (i.e., it is part of + //! a training, validation, or test phase mask) + bool IsValidForPhase(const unsigned lid, + const galois::GNNPhase current_phase) const; - //! Return pointer to all edge destinations; used with MKL - const uint32_t* GetEdgeDestPointer() const { - return partitioned_graph_->edge_dst_ptr(); - } + ////////////////////////////////////////////////////////////////////////////// //! 
Given a matrix and the column size, do an aggregate sync where each row //! is considered a node's data and sync using the graph's Gluon @@ -137,9 +139,6 @@ class GNNGraph { //! The entire topology of the dataset: used for things like norm factor //! calculation or sampling WholeGraph whole_graph_; - //! The indices pointer from the partitioned graph except with a 0 - //! prepended to it; needed for MKL calls - std::vector zero_start_graph_indices_; //! Sync substrate for the partitioned graph std::unique_ptr> sync_substrate_; //! True if labels are single class @@ -173,6 +172,10 @@ class GNNGraph { // TODO vars for subgraphs as necessary + ////////////////////////////////////////////////////////////////////////////// + // Initialization + ////////////////////////////////////////////////////////////////////////////// + //! Read labels of local nodes only void ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label); @@ -185,54 +188,21 @@ class GNNGraph { GNNRange* mask_range, GNNLabel* masks); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); - //! Init the node start indices that have a 0 at the beginning; straight - //! copy of the array from the partitioned graph save for the 0 at the - //! first element. - void InitZeroStartGraphIndices(); //! Reads the entire graph topology in (but nothing else) void ReadWholeGraph(const std::string& dataset_name); //! Initializes the norm factors using the entire graph's topology for global //! degree access void InitNormFactor(); - // public: - // void saveDistGraph(DGraph* a); - // galois::graphs::GluonSubstrate* getSyncSubstrate(); - // float_t* get_feats_ptr() { return h_feats; } - // float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } - // label_t* get_labels_ptr() { return h_labels; } - // label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } - // float_t* get_norm_factors_ptr() { return normFactors.data(); } - // float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } - // - // //! allocate the norm factor vector - // void allocNormFactor(); - // void allocNormFactorSub(int subID); - // //! construct norm factor vector by using data from global graph - // void constructNormFactor(deepgalois::Context* globalContext); - // void constructNormFactorSub(int subgraphID); - // - // void constructSubgraphLabels(size_t m, const mask_t* masks); - // void constructSubgraphFeatures(size_t m, const mask_t* masks); - // - // //! return label for some node - // //! NOTE: this is LID, not GID - // label_t get_label(size_t lid) { return h_labels[lid]; } - // - // //! returns pointer to the features of each local node - // float_t* get_in_ptr(); - // - // //! allocate memory for subgraphs (don't actually build them) - // void allocateSubgraphs(int num_subgraphs, unsigned max_size); - // - // //! return if a vertex is owned by the partitioned graph this context - // contains bool isOwned(unsigned gid); - // //! return if part graph has provided vertex for given gid locally - // bool isLocal(unsigned gid); - // //! get GID of an lid for a vertex - // unsigned getGID(unsigned lid); - // //! get local id of a vertex given a global id for that vertex - // unsigned getLID(unsigned gid); + ////////////////////////////////////////////////////////////////////////////// + // GPU things + ////////////////////////////////////////////////////////////////////////////// + +#ifdef GALOIS_ENABLE_GPU + //! 
This satisfies the cuda context forward declaration in host decls: + //! context fields + GNNGraphGPUAllocations gpu_memory_; +#endif }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 3b0c79a628..fe997ed228 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -71,8 +71,6 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, *partitioned_graph_, host_id_, galois::runtime::getSystemNetworkInterface().Num, false); - // create the 0 based row indices for MKL use - InitZeroStartGraphIndices(); // read in entire graph topology ReadWholeGraph(dataset_name); // init norm factors using the whole graph topology @@ -340,23 +338,6 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } -void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { - GALOIS_LOG_VERBOSE("[{}] Initializing node indices with 0 prepended", - host_id_); - // size is num nodes + 1 - zero_start_graph_indices_.resize(partitioned_graph_->size() + 1); - // first element is zero - zero_start_graph_indices_[0] = 0; - // the rest is a straight copy from partitioned graph (use edge_end to access - // it) - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t i) { - zero_start_graph_indices_[i + 1] = *(partitioned_graph_->edge_end(i)); - }, - galois::loopname("InitZeroStartGraphIndices")); -} - void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { std::string input_file = input_directory_ + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, From 04bd6c51ab238c4d61d48d61b8740d35b6da15e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 10:43:05 -0600 Subject: [PATCH 391/660] Fix GNNGraph default directory init bug The constructor for GNN Graph that does not take an input directory was constructing a GNNGraph in its function scope and not constructing the actual caller of the constructor. This led to most of the unit tests failing as the graph was not constructed correctly. Moral: run unit tests before every commit. --- libgnn/src/graphs/GNNGraph.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fe997ed228..2292614b70 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -38,10 +38,9 @@ size_t gnn_matrix_to_sync_column_length_ = 0; galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label) { - GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, - has_single_class_label); -} + bool has_single_class_label) + : GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, + has_single_class_label) {} galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, const std::string& dataset_name, From 10943ca906a6a2ac50d18c758a5e530f5e8fda8f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 10:48:55 -0600 Subject: [PATCH 392/660] Allow getMarshalGraph to not deallocate graph Added a version of getMarshalGraph in the Gluon substrate that allows a user to not deallocate the source graph. Added in anticipation of GNN work where it might be beneficial to keep the CPU version of the graph around in memory. 
--- libgluon/include/galois/graphs/GluonSubstrate.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 11a89157e7..f79427af89 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -3301,7 +3301,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } public: - void getMarshalGraph(MarshalGraph& m) { + void getMarshalGraph(MarshalGraph& m) { getMarshalGraph(m, true); } + + void getMarshalGraph(MarshalGraph& m, bool deallocate_graph) { m.nnodes = userGraph.size(); m.nedges = userGraph.sizeEdges(); m.numOwned = userGraph.numMasters(); @@ -3389,7 +3391,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // user needs to provide method of freeing up graph (it can do nothing // if they wish) - userGraph.deallocate(); + if (deallocate_graph) { + userGraph.deallocate(); + } } #endif // het galois def From bd9dea1a0042ceb627dd7702a266da5320400a9c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 13:03:16 -0600 Subject: [PATCH 393/660] Allocate features of graph on GPU (first test) Adds initial CUDA headers/impl to GNNGraph, starting with a struct that is meant to hold all GPU based allocations for the graph. This is compiled separately from the main GNN library using nvcc and linked into the main GNN library. This commit adds an Init GPU call that is meant to initialize all GPU memory used for GNN training. For now, it only allocates/copies over the features of the graph. The rest are incoming in later commits. Also contains a change to the Gluon gradient interface which adds a few dummy typedefs so that Gluon doesn't complain (GPU build expects certain typedefs to exist due to the marshal graph function). 
--- CMakeLists.txt | 7 +++- libgnn/CMakeLists.txt | 24 +++++++++++- libgnn/include/galois/CUDAUtil.h | 17 +++++++++ libgnn/include/galois/GNNTypes.h | 4 ++ libgnn/include/galois/graphs/GNNGraph.cuh | 37 +++++++++++++++++++ libgnn/include/galois/graphs/GNNGraph.h | 3 ++ .../galois/layers/GluonGradientInterface.h | 9 +++++ libgnn/src/graphs/GNNGraph.cpp | 15 +++++++- libgnn/src/graphs/GNNGraph.cu | 11 ++++++ 9 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 libgnn/include/galois/CUDAUtil.h create mode 100644 libgnn/include/galois/graphs/GNNGraph.cuh create mode 100644 libgnn/src/graphs/GNNGraph.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 937251376c..41f318b828 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,7 +236,6 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) - add_subdirectory(libgnn) endif() # TODO(loc) prefix with GALOIS @@ -279,6 +278,12 @@ if (GALOIS_ENABLE_GPU) #find_package(OpenCL REQUIRED) endif() endif() + +if (GALOIS_ENABLE_DIST) + # here because I need the GPU declarations above + add_subdirectory(libgnn) +endif() + add_subdirectory(libpangolin) # Applications (apps) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index d635781ba6..1ca05c8632 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -23,5 +23,27 @@ target_include_directories(galois_gnn PUBLIC ) set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) - add_subdirectory(test) + +if (GALOIS_ENABLE_GPU) + target_compile_definitions(galois_gnn PUBLIC GALOIS_ENABLE_GPU=1) + + # create the galois_gnn_gpu library to get linked into galois_gnn + set(gpusources + src/graphs/GNNGraph.cu + ) + add_library(galois_gnn_gpu STATIC ${gpusources}) + target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) + target_compile_options(galois_gnn_gpu PUBLIC "$<$:--expt-extended-lambda>") + set_property(TARGET galois_gnn_gpu PROPERTY CUDA_STANDARD 14) + target_compile_definitions(galois_gnn_gpu PUBLIC GALOIS_ENABLE_GPU=1) + target_include_directories(galois_gnn_gpu PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + + # link to gpu lib (which takes care of moderngpu and cub) + target_link_libraries(galois_gnn_gpu Galois::gpu galois_support) + + # gpu -> cpu lib + target_link_libraries(galois_gnn galois_gnn_gpu) +endif() diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h new file mode 100644 index 0000000000..dabd1638b4 --- /dev/null +++ b/libgnn/include/galois/CUDAUtil.h @@ -0,0 +1,17 @@ +#ifdef GALOIS_ENABLE_GPU +//! @file CUDAUtil.h +//! Contains various utility functions for CUDA. +#pragma once +#include +#include "galois/Logging.h" + +#define CUDA_CHECK(condition) \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + GALOIS_LOG_ERROR("CUDA error: {}", cudaGetErrorString(error)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index a04fa14687..99cb700cb4 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -13,6 +13,10 @@ using GNNFloat = float; using GNNLabel = uint8_t; //! Type of a feature on vertices using GNNFeature = float; +//! Type of node index on gpus +using GPUNodeIndex = uint32_t; +//! Type of edge index on gpus +using GPUEdgeIndex = uint64_t; //! 
Phase of GNN computation enum class GNNPhase { kTrain, kValidate, kTest }; diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh new file mode 100644 index 0000000000..3d8bd45d58 --- /dev/null +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -0,0 +1,37 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { +namespace graphs { + +//! Class to hold everything allocated on the GPU that has to do with GNNGraph. +//! Similar in nature to the CUDAContext class in existing D-IrGL +class GNNGraphGPUAllocations { +public: + // XXX getters for everything, the rest of the setters, etc. + + // XXX destructor for allocated memory + + //! Host side function that allocates memory for the features on the vertices + //! and copies them over to the GPU. + void SetFeatures(const std::vector& features); + +private: + // Note: no graph object, similar to Xuhao's LGraph in older code + //! edge_index[n] gets the first edge index for node n (i.e. edge_index_[0] + //! = 0) + GPUEdgeIndex* edge_index_{nullptr}; + //! edge_destinations_[i] = destination for edge i + GPUNodeIndex* edge_destinations_{nullptr}; + //! (Local) feature vector + GNNFeature* feature_vector_{nullptr}; + //! (Local) ground truth vector + GNNFloat* ground_truth_{nullptr}; + //! (Local) norm factors + GNNFloat* norm_factors_{nullptr}; + + // XXX masks? other things I haven't considered yet? +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3f73aff510..81b2830e9d 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -202,6 +202,9 @@ class GNNGraph { //! This satisfies the cuda context forward declaration in host decls: //! context fields GNNGraphGPUAllocations gpu_memory_; + //! Call this to setup GPU memory for this graph: allocates necessary GPU + //! memory and copies things over + void InitGPUMemory(); #endif }; diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h index 92c0a5eb69..473151efcd 100644 --- a/libgnn/include/galois/layers/GluonGradientInterface.h +++ b/libgnn/include/galois/layers/GluonGradientInterface.h @@ -15,6 +15,15 @@ namespace galois { //! they'll all see the same values after the first round of sync anyways) class GluonGradientInterface { public: + // typedefs required by GPU end to build; not actually used anywhere in this + // class (...at the moment) + // as such, dummy declarations that don't particularly make sense + // TODO will likely need to revisit once GPU substrate for this needs to be + // setup + using GraphNode = uint32_t; + using edge_iterator = boost::counting_iterator; + using EdgeType = char; + //! Save reference to weight gradients. //! Then setup mirror metadata for Gluon to use during setup. 
GluonGradientInterface(std::vector& gradients); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 2292614b70..804486a6bb 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -47,7 +47,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, GNNPartitionScheme partition_scheme, bool has_single_class_label) : input_directory_(input_directory) { - GALOIS_LOG_VERBOSE("[{}] Constructing partitiong for {}", host_id_, + GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, dataset_name); // save host id host_id_ = galois::runtime::getSystemNetworkInterface().ID; @@ -74,6 +74,12 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, ReadWholeGraph(dataset_name); // init norm factors using the whole graph topology InitNormFactor(); + +#ifdef GALOIS_ENABLE_GPU + // allocate/copy data structures over to GPU + GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); + InitGPUMemory(); +#endif } bool galois::graphs::GNNGraph::IsValidForPhase( @@ -364,3 +370,10 @@ void galois::graphs::GNNGraph::InitNormFactor() { }, galois::loopname("InitNormFactor")); } + +#ifdef GALOIS_ENABLE_GPU +void galois::graphs::GNNGraph::InitGPUMemory() { + // XXX finish up GPU memory allocation; currently just testing the build + gpu_memory_.SetFeatures(local_node_features_); +} +#endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu new file mode 100644 index 0000000000..4282ba753d --- /dev/null +++ b/libgnn/src/graphs/GNNGraph.cu @@ -0,0 +1,11 @@ +#include "galois/CUDAUtil.h" +#include "galois/graphs/GNNGraph.cuh" + +void galois::graphs::GNNGraphGPUAllocations::SetFeatures( + const std::vector& features) { + CUDA_CHECK(cudaMalloc((void**)(&feature_vector_), + features.size() * sizeof(GNNFeature))); + CUDA_CHECK(cudaMemcpy(feature_vector_, features.data(), + features.size() * sizeof(GNNFeature), + cudaMemcpyHostToDevice)); +} From 50f8de139a53521f8cf0943c436e25fd509468b4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 13:52:51 -0600 Subject: [PATCH 394/660] epoch-test: changed input to cora for faster test --- libgnn/test/epoch-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 21d5249fd1..2486269ccd 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,7 +14,7 @@ int main() { // load graph auto test_graph = std::make_unique( - "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); + "cora", galois::graphs::GNNPartitionScheme::kCVC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, From 6824c288e8f06fa47356d09fe55e6d53b7b12ee8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 17:45:06 -0600 Subject: [PATCH 395/660] Copy over int graph topology onto GPU for GNN Adds code to cast the CSR of a partitioned graph into ints and copy it over to the GPU. The use of ints is because BLAS CSR standard is apparently ints, so this is a future-proofing step for using GPU BLAS calls. Also adds the destructor for GPU graph allocations which frees up allocated memory + saves more variables onto GPU memory like feature length, node count, etc. 
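The int narrowing described above is motivated by the GPU sparse BLAS interfaces: the CSR-based routines index their row-offset and column arrays with 32-bit integers, so keeping the topology as int avoids a conversion at call time. As a hedged illustration of the eventual consumer (this is not code from this commit, and it assumes the generic cuSPARSE API is what the GCN aggregation would ultimately call), an int-indexed CSR would be wrapped in a matrix descriptor roughly like this:

    #include <cstdint>
    #include <cusparse.h>

    // Sketch only: registers device-resident int CSR arrays with cuSPARSE.
    // d_row_offsets has num_nodes + 1 entries; d_edge_dests and d_edge_vals
    // have num_edges entries each; all three already live on the GPU.
    cusparseSpMatDescr_t MakeCsrDescriptor(int64_t num_nodes, int64_t num_edges,
                                           int* d_row_offsets, int* d_edge_dests,
                                           float* d_edge_vals) {
      cusparseSpMatDescr_t descr;
      cusparseCreateCsr(&descr, num_nodes, num_nodes, num_edges,
                        d_row_offsets, d_edge_dests, d_edge_vals,
                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
      return descr;
    }
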
--- libgnn/include/galois/CUDAUtil.h | 8 ++++ libgnn/include/galois/graphs/GNNGraph.cuh | 28 +++++++++++--- libgnn/src/graphs/GNNGraph.cpp | 45 ++++++++++++++++++++++- libgnn/src/graphs/GNNGraph.cu | 45 ++++++++++++++++++++++- 4 files changed, 119 insertions(+), 7 deletions(-) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index dabd1638b4..f8d7a03b80 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -14,4 +14,12 @@ } \ } while (0) +#define CUDA_FREE(ptr) \ + do { \ + if (ptr) { \ + CUDA_CHECK(cudaFree(ptr)); \ + ptr = nullptr; \ + } \ + } while (0) + #endif diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 3d8bd45d58..4e0bb8d193 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -10,27 +10,45 @@ class GNNGraphGPUAllocations { public: // XXX getters for everything, the rest of the setters, etc. - // XXX destructor for allocated memory + //! CUDA frees all allocated memory (i.e. non-nullptr) + ~GNNGraphGPUAllocations(); + + //! Copies graph topology over to GPU; using ints because cuSparse lib + //! expects ints for the CSR arrays + void SetGraphTopology(const std::vector& edge_index, + const std::vector& edge_dests); //! Host side function that allocates memory for the features on the vertices //! and copies them over to the GPU. - void SetFeatures(const std::vector& features); + void SetFeatures(const std::vector& features, + unsigned num_features); private: + // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS + + //! Number of features (which is equivalent to number of nodes) + unsigned* num_features_{nullptr}; + //! Length of a feature vector + unsigned* feature_length_{nullptr}; + //! Number of edges in graph + unsigned* num_edges_{nullptr}; + // Note: no graph object, similar to Xuhao's LGraph in older code //! edge_index[n] gets the first edge index for node n (i.e. edge_index_[0] //! = 0) - GPUEdgeIndex* edge_index_{nullptr}; + int* edge_index_{nullptr}; //! edge_destinations_[i] = destination for edge i - GPUNodeIndex* edge_destinations_{nullptr}; + int* edge_destinations_{nullptr}; //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; + // TODO need these on GPU? //! (Local) ground truth vector GNNFloat* ground_truth_{nullptr}; //! (Local) norm factors GNNFloat* norm_factors_{nullptr}; - // XXX masks? other things I haven't considered yet? + // TODO masks? other things I haven't considered yet? will determine if they + // are needed }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 804486a6bb..42eb645c43 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -2,6 +2,7 @@ #include "galois/Logging.h" #include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" +#include namespace { //! 
Partitions a particular dataset given some partitioning scheme @@ -374,6 +375,48 @@ void galois::graphs::GNNGraph::InitNormFactor() { #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { // XXX finish up GPU memory allocation; currently just testing the build - gpu_memory_.SetFeatures(local_node_features_); + + // create int casted CSR + uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); + uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); + + // + 1 because first element is 0 in BLAS CSRs + std::vector e_index(partitioned_graph_->size() + 1); + std::vector e_dest(partitioned_graph_->sizeEdges()); + + // set in parallel + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size() + 1), + [&](size_t index) { + if (index != 0) { + if (e_index_ptr[index - 1] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_index_ptr[index - 1]); + } + e_index[index] = static_cast(e_index_ptr[index - 1]); + } else { + e_index[index] = 0; + } + }, + galois::loopname("GPUEdgeIndexConstruction")); + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->sizeEdges()), + [&](size_t edge) { + if (e_dest_ptr[edge] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_dest_ptr[edge]); + } + + e_dest[edge] = static_cast(e_dest_ptr[edge]); + }, + galois::loopname("GPUEdgeDestConstruction")); + + gpu_memory_.SetGraphTopology(e_index, e_dest); + e_index.clear(); + e_dest.clear(); + + gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index 4282ba753d..aae729c015 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -1,8 +1,51 @@ #include "galois/CUDAUtil.h" #include "galois/graphs/GNNGraph.cuh" +galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing GPU graph allocations"); + CUDA_FREE(num_features_); + CUDA_FREE(feature_length_); + CUDA_FREE(num_edges_); + CUDA_FREE(edge_index_); + CUDA_FREE(edge_destinations_); + CUDA_FREE(feature_vector_); + CUDA_FREE(ground_truth_); + CUDA_FREE(norm_factors_); +} + +void galois::graphs::GNNGraphGPUAllocations::SetGraphTopology( + const std::vector& edge_index, const std::vector& edge_dests) { + // num edges variable + CUDA_CHECK(cudaMalloc((void**)(&num_edges_), sizeof(unsigned))); + unsigned num_edges = edge_dests.size(); + CUDA_CHECK(cudaMemcpy(num_edges_, &num_edges, sizeof(unsigned), + cudaMemcpyHostToDevice)); + + // topology; assumes caller already setup vectors accordingly + CUDA_CHECK( + cudaMalloc((void**)(&edge_index_), edge_index.size() * sizeof(int))); + CUDA_CHECK(cudaMalloc((void**)(&edge_destinations_), + edge_dests.size() * sizeof(int))); + CUDA_CHECK(cudaMemcpy(edge_index_, edge_index.data(), + edge_index.size() * sizeof(int), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(edge_destinations_, edge_dests.data(), + edge_dests.size() * sizeof(int), + cudaMemcpyHostToDevice)); +} + void galois::graphs::GNNGraphGPUAllocations::SetFeatures( - const std::vector& features) { + const std::vector& features, unsigned num_features) { + // feature count & length + CUDA_CHECK(cudaMalloc((void**)(&num_features_), sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void**)(&feature_length_), sizeof(unsigned))); + CUDA_CHECK(cudaMemcpy(num_features_, &num_features, sizeof(unsigned), + cudaMemcpyHostToDevice)); + 
unsigned feature_length = features.size() / num_features; + CUDA_CHECK(cudaMemcpy(feature_length_, &feature_length, sizeof(unsigned), + cudaMemcpyHostToDevice)); + + // features themselves CUDA_CHECK(cudaMalloc((void**)(&feature_vector_), features.size() * sizeof(GNNFeature))); CUDA_CHECK(cudaMemcpy(feature_vector_, features.data(), From ccb21dc1446b775c966f02c71de3c2fdbf504583 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 4 Nov 2020 15:41:16 -0600 Subject: [PATCH 396/660] Training set specification for citeseer and pubmed Adds training set specification for citeseer and pubmed datasets to the CuSP partitioner so that they can be run without crashes. g# --- libcusp/include/galois/graphs/NewGeneric.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index f4837ff1de..048cfa4bc2 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -88,6 +88,12 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("reddit") != std::string::npos) { bps.push_back(0); bps.push_back(153431); + } else if (filename.find("citeseer") != std::string::npos) { + bps.push_back(0); + bps.push_back(120); + } else if (filename.find("pubmed") != std::string::npos) { + bps.push_back(0); + bps.push_back(60); } else if (filename.find("ppi") != std::string::npos) { bps.push_back(0); bps.push_back(9716); @@ -96,7 +102,7 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(5); } else { // XXX only die under certain conditions - //GALOIS_DIE("invalid input for gnn partitioning ", filename, + // GALOIS_DIE("invalid input for gnn partitioning ", filename, // " hardcode needed"); } // TODO hardcode the rest From d7792b3b0cab40a514d5caf8a88dc78943425eb8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 4 Nov 2020 19:00:29 -0600 Subject: [PATCH 397/660] GNNGraph: copy node ground truth labels to GPU Adds a function to copy ground truth for vertices to the GPU and adds it to the GPU memory init call. --- libgnn/include/galois/graphs/GNNGraph.cuh | 5 ++++- libgnn/include/galois/graphs/GNNGraph.h | 3 +-- libgnn/src/graphs/GNNGraph.cpp | 1 + libgnn/src/graphs/GNNGraph.cu | 9 +++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 4e0bb8d193..9e047b6a0f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -23,6 +23,9 @@ public: void SetFeatures(const std::vector& features, unsigned num_features); + //! Copy over ground truth for the graph to GPU + void SetLabels(const std::vector& ground_truth); + private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS @@ -41,9 +44,9 @@ private: int* edge_destinations_{nullptr}; //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; - // TODO need these on GPU? //! (Local) ground truth vector GNNFloat* ground_truth_{nullptr}; + // TODO need this? //! (Local) norm factors GNNFloat* norm_factors_{nullptr}; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 81b2830e9d..76ac693cf5 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -199,8 +199,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU - //! 
This satisfies the cuda context forward declaration in host decls: - //! context fields + //! Object that holds all GPU allocated pointers to memory related to graphs. GNNGraphGPUAllocations gpu_memory_; //! Call this to setup GPU memory for this graph: allocates necessary GPU //! memory and copies things over diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 42eb645c43..918ce3d735 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -418,5 +418,6 @@ void galois::graphs::GNNGraph::InitGPUMemory() { e_dest.clear(); gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); + gpu_memory_.SetLabels(local_ground_truth_labels_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index aae729c015..f13bbf4089 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -52,3 +52,12 @@ void galois::graphs::GNNGraphGPUAllocations::SetFeatures( features.size() * sizeof(GNNFeature), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::SetLabels( + const std::vector& ground_truth) { + CUDA_CHECK(cudaMalloc((void**)(&ground_truth_), + ground_truth.size() * sizeof(GNNLabel))); + CUDA_CHECK(cudaMemcpy(ground_truth_, ground_truth.data(), + ground_truth.size() * sizeof(GNNLabel), + cudaMemcpyHostToDevice)); +} From ede6c109fb39959e53517879408de97b6ba414eb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 5 Nov 2020 19:38:39 -0600 Subject: [PATCH 398/660] Massive signature overhaul for vectors in GNNs Previously in most of the GNN code std::vectors were being passed around for the main calls to training/inferencing. The problem with this is that GPUs do not have std::vector, but only pointers. To avoid losing out on the size call, a new class called PointerWithSize has been added that wraps a pointer and attaches a size to it that can be accessed via size. It also has a similar interface to a std::vector so as little of the code has to be changed as possible. This class is now used instead of std::vector in the main Forward/Backward Phase calls in the GNN in preparation for the GPU support. This also required a few related changes such as making some functions non-const (because the PointerWithSize is not a const construction since it grabs the raw pointer). Most of the tests have been edited as well to account for the new return type. 
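In practice the wrapper behaves like a read/write view: a std::vector converts to it implicitly (the converting constructor grabs .data() and .size()), GPU code can hand over a raw pointer plus an element count, and downstream code indexes it exactly as it indexed the old vectors. A small usage sketch against the PointerWithSize interface added below (SumAll and the literal sizes are illustrative only):

    #include <cstddef>
    #include <vector>
    #include "galois/GNNTypes.h" // defines galois::PointerWithSize

    // Works for anything that can be viewed as a (pointer, length) pair.
    float SumAll(galois::PointerWithSize<float> data) {
      float total = 0.0f;
      for (size_t i = 0; i < data.size(); ++i) {
        total += data[i]; // operator[] mirrors std::vector indexing
      }
      return total;
    }

    int main() {
      std::vector<float> host_values(16, 1.0f);
      float from_vector  = SumAll(host_values); // implicit conversion from vector
      float from_pointer = SumAll({host_values.data(), host_values.size()});
      return (from_vector == from_pointer) ? 0 : 1;
    }
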
--- libgnn/include/galois/GNNTypes.h | 32 +++ libgnn/include/galois/GraphNeuralNetwork.h | 8 +- libgnn/include/galois/graphs/GNNGraph.h | 4 +- libgnn/include/galois/layers/GNNLayer.h | 28 +- .../galois/layers/GraphConvolutionalLayer.h | 11 +- libgnn/include/galois/layers/SoftmaxLayer.h | 11 +- libgnn/src/GraphNeuralNetwork.cpp | 28 +- libgnn/src/layers/GNNLayer.cpp | 8 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 13 +- libgnn/src/layers/SoftmaxLayer.cpp | 16 +- libgnn/test/accuracy-test.cpp | 12 +- libgnn/test/aggregate-sync-test.cpp | 22 +- libgnn/test/convlayer-test.cpp | 156 ++++++------ libgnn/test/epoch-test.cpp | 8 +- libgnn/test/gnnfb-test.cpp | 10 +- libgnn/test/gpu-convlayer-test.cpp | 239 ++++++++++++++++++ libgnn/test/softmaxlayer-test.cpp | 6 +- libgnn/test/weight-sync-test.cpp | 5 +- 18 files changed, 449 insertions(+), 168 deletions(-) create mode 100644 libgnn/test/gpu-convlayer-test.cpp diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 99cb700cb4..40f19da7b0 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -4,6 +4,7 @@ #include #include +#include namespace galois { //! Floating point type to use throughout GNN compute; typedef'd so it's easier @@ -21,4 +22,35 @@ using GPUEdgeIndex = uint64_t; //! Phase of GNN computation enum class GNNPhase { kTrain, kValidate, kTest }; +//! Vector like wrapper over a pointer and size; exists solely to pass around +//! raw pointers with size (because vectors are a no-go due to the code +//! handling both CPU and GPU.) +template +class PointerWithSize { +public: + //! Default is empty + PointerWithSize() : ptr_{nullptr}, num_elements_{0} {} + //! Generic constructor which takes 2 fields to initialize + PointerWithSize(PointerType* ptr, size_t num_elements) + : ptr_{ptr}, num_elements_{num_elements} {} + //! Grab vector pointer + size + PointerWithSize(std::vector& v) + : ptr_{v.data()}, num_elements_{v.size()} {} + //! Alias to return pointer data + PointerType* data() { return ptr_; } + //! Alias to return pointer data (const version) + const PointerType* data() const { return ptr_; } + //! # elements that pointer should contain + size_t size() const { return num_elements_; } + // accessors; one lets you mess with the array + PointerType& operator[](size_t i) { return ptr_[i]; } + const PointerType& operator[](size_t i) const { return ptr_[i]; } + +private: + //! Pointer to data + PointerType* ptr_; + //! # elements that I should be able to access from pointer + size_t num_elements_; +}; + } // end namespace galois diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 725e3a69d1..9e7e2266d0 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -106,7 +106,7 @@ class GraphNeuralNetwork { size_t num_intermediate_layers() { return gnn_layers_.size() - 1; } //! Returns pointer to intermediate layer i - const galois::GNNLayer* GetIntermediateLayer(size_t i) { + galois::GNNLayer* GetIntermediateLayer(size_t i) { if (i < gnn_layers_.size() - 1) { return gnn_layers_[i].get(); } else { @@ -130,7 +130,7 @@ class GraphNeuralNetwork { } //! Returns the output layer - const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } //! Do training for a specified # of epochs and return test accuracy at the //! end of it @@ -140,9 +140,9 @@ class GraphNeuralNetwork { //! 
vector representation. //! Also known as the forward phase in most literature //! @returns Output layer's output - const std::vector* DoInference(); + const PointerWithSize DoInference(); - float GetGlobalAccuracy(const std::vector& predictions); + float GetGlobalAccuracy(const PointerWithSize predictions); //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 76ac693cf5..2b55d17b7a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -101,8 +101,8 @@ class GNNGraph { } //! Return matrix of the local node features - const std::vector& GetLocalFeatures() const { - return local_node_features_; + const PointerWithSize GetLocalFeatures() { + return PointerWithSize(local_node_features_); } //! Given an LID and the current phase of GNN computation, determine if the diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 2473de7229..e738bdacca 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -80,16 +80,17 @@ class GNNLayer { } } - const std::vector& GetForwardOutput() const { - return forward_output_matrix_; + const PointerWithSize GetForwardOutput() { + return PointerWithSize(forward_output_matrix_); } - const std::vector& GetBackwardOutput() const { - return backward_output_matrix_; + + const PointerWithSize GetBackwardOutput() { + return PointerWithSize(backward_output_matrix_); } //! Returns the weight gradients - const std::vector& GetLayerWeightGradients() const { - return layer_weight_gradients_; + const PointerWithSize GetLayerWeightGradients() { + return PointerWithSize(layer_weight_gradients_); } //! Returns dimensions of this layer @@ -106,8 +107,9 @@ class GNNLayer { //! ultimately leads to an output (classfication of node labels) at the end //! of the GNN. //! @returns Output of the forward phase (i.e. input to next layer) - virtual const std::vector& - ForwardPhase(const std::vector& input_embeddings) = 0; + // XXX size of embeddings + virtual const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) = 0; //! Conducts the backward phase given the input to this layer; the backward //! phase calculates the gradients to update the weights of trainable //! parts of the layer (e.g., weights, trainable params for aggregate, etc.). @@ -117,9 +119,9 @@ class GNNLayer { //! one; takes a pointer to save space by writing intermediate results to it //! @returns Output of the backward phase (i.e. input to previous layer); note //! it's a pointer because layer can mess with it - virtual std::vector* - BackwardPhase(const std::vector& prev_layer_input, - std::vector* input_gradient) = 0; + virtual PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) = 0; //! Given an optimizer, update the weights in this layer based on gradients //! stored in the layer @@ -185,7 +187,7 @@ class GNNLayer { //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate - void DoDropout(const std::vector& input_to_drop, + void DoDropout(const PointerWithSize input_to_drop, std::vector* output_matrix); //! 
Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); @@ -194,7 +196,7 @@ class GNNLayer { //! matrix void Activation(); //! Calculate derivative of activation function based on config on the matrix - void ActivationDerivative(std::vector* matrix); + void ActivationDerivative(PointerWithSize* matrix); //! Synchronize weight gradients with a summation void WeightGradientSyncSum(); diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 123a8d774a..196fa752c8 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -20,11 +20,12 @@ class GraphConvolutionalLayer : public GNNLayer { GNNLayerConfig()) {} // Parent functions - const std::vector& - ForwardPhase(const std::vector& input_embeddings) final; - std::vector* - BackwardPhase(const std::vector& prev_layer_input, - std::vector* input_gradient) final; + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; private: // 2 temporaries the size of the forward input; used for dropout and diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 815f2401ff..5c412f6bf3 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -25,14 +25,13 @@ class SoftmaxLayer : public GNNLayer { } //! Creates probability distribution of each row of input - const std::vector& - ForwardPhase(const std::vector& input_embeddings) final; - + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. - std::vector* - BackwardPhase(const std::vector& prev_layer_input, - std::vector* input_gradient) final; + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; private: //! 
Loss for each row of the input diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 82c1d40c07..eb419ba26c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -63,9 +63,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { - const std::vector* predictions = DoInference(); + const PointerWithSize predictions = DoInference(); GradientPropagation(); - float train_accuracy = GetGlobalAccuracy(*predictions); + float train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Train accuracy is ", train_accuracy, "\n"); @@ -77,8 +77,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer acc_timer("FinalAccuracyTest"); acc_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); - const std::vector* predictions = DoInference(); - float global_accuracy = GetGlobalAccuracy(*predictions); + const PointerWithSize predictions = DoInference(); + float global_accuracy = GetGlobalAccuracy(predictions); acc_timer.stop(); if (this_host == 0) { @@ -88,17 +88,19 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { return global_accuracy; } -const std::vector* galois::GraphNeuralNetwork::DoInference() { +const galois::PointerWithSize +galois::GraphNeuralNetwork::DoInference() { // start with graph features and pass it through all layers of the network - const std::vector* layer_input = &(graph_->GetLocalFeatures()); + galois::PointerWithSize layer_input = + graph_->GetLocalFeatures(); for (std::unique_ptr& ptr : gnn_layers_) { - layer_input = &(ptr->ForwardPhase(*layer_input)); + layer_input = ptr->ForwardPhase(layer_input); } return layer_input; } float galois::GraphNeuralNetwork::GetGlobalAccuracy( - const std::vector& predictions) { + const PointerWithSize predictions) { // check owned nodes' accuracy size_t num_labels = graph_->GetNumLabelClasses(); assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); @@ -143,7 +145,7 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // from output layer get initial gradients std::vector dummy; std::unique_ptr& output_layer = gnn_layers_.back(); - std::vector* current_gradients = + galois::PointerWithSize current_gradients = output_layer->BackwardPhase(dummy, nullptr); // loops through intermediate layers in a backward fashion @@ -153,16 +155,16 @@ void galois::GraphNeuralNetwork::GradientPropagation() { size_t layer_index = gnn_layers_.size() - 2 - i; // get the input to the layer before this one - const std::vector* prev_layer_input; + galois::PointerWithSize prev_layer_input; if (layer_index != 0) { - prev_layer_input = &(gnn_layers_[layer_index - 1]->GetForwardOutput()); + prev_layer_input = gnn_layers_[layer_index - 1]->GetForwardOutput(); } else { - prev_layer_input = &(graph_->GetLocalFeatures()); + prev_layer_input = graph_->GetLocalFeatures(); } // backward prop and get a new set of gradients current_gradients = gnn_layers_[layer_index]->BackwardPhase( - *prev_layer_input, current_gradients); + prev_layer_input, ¤t_gradients); // if not output do optimization/gradient descent // at this point in the layer the gradients exist; use the gradients to // update the weights of the layer diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index dc81a9ca2b..a0ead51e10 100644 --- 
a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -61,8 +61,9 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { // XXX Something is wrong with dropout; accuracy suffers, figure out what // it is -void galois::GNNLayer::DoDropout(const std::vector& input_to_dropout, - std::vector* output_matrix) { +void galois::GNNLayer::DoDropout( + const PointerWithSize input_to_dropout, + std::vector* output_matrix) { size_t num_elements = output_matrix->size(); assert(num_elements == dropout_mask_.size()); assert(num_elements == input_to_dropout.size()); @@ -114,7 +115,8 @@ void galois::GNNLayer::Activation() { galois::loopname("ReLU")); } -void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { +void galois::GNNLayer::ActivationDerivative( + PointerWithSize* gradient) { // TODO only does relu at the moment; should check user specified activation // and act accordingly // keep gradient if the original output is greater than 0 diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 57a5d9505b..e2e80ce8b1 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -21,9 +21,9 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( layer_type_ = galois::GNNLayerType::kGraphConvolutional; } -const std::vector& +const galois::PointerWithSize galois::GraphConvolutionalLayer::ForwardPhase( - const std::vector& input_embeddings) { + const galois::PointerWithSize input_embeddings) { assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(in_temp_1_.size() == input_embeddings.size()); @@ -64,9 +64,10 @@ galois::GraphConvolutionalLayer::ForwardPhase( return forward_output_matrix_; } -std::vector* galois::GraphConvolutionalLayer::BackwardPhase( - const std::vector& prev_layer_input, - std::vector* input_gradient) { +galois::PointerWithSize +galois::GraphConvolutionalLayer::BackwardPhase( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation if (config_.do_activation) { @@ -121,7 +122,7 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( DoDropoutDerivative(); } - return &backward_output_matrix_; + return PointerWithSize(backward_output_matrix_); } void galois::GraphConvolutionalLayer::AggregateAll( diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 1262555a36..07e78d3c1f 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -3,11 +3,11 @@ #include "galois/layers/SoftmaxLayer.h" // Allocate memory and initialize -void galois::SoftmaxLayer::Init() { -} +void galois::SoftmaxLayer::Init() {} -const std::vector& galois::SoftmaxLayer::ForwardPhase( - const std::vector& input_embeddings) { +const galois::PointerWithSize +galois::SoftmaxLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; @@ -42,9 +42,9 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( return forward_output_matrix_; } -std::vector* -galois::SoftmaxLayer::BackwardPhase(const std::vector&, - std::vector*) { +galois::PointerWithSize +galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, + PointerWithSize*) { const size_t feature_length = 
layer_dimensions_.input_columns; galois::do_all( @@ -83,7 +83,7 @@ galois::SoftmaxLayer::BackwardPhase(const std::vector&, // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxBackward")); - return &backward_output_matrix_; + return PointerWithSize(backward_output_matrix_); } // TODO function for getting loss diff --git a/libgnn/test/accuracy-test.cpp b/libgnn/test/accuracy-test.cpp index e1fc17702e..6d26284325 100644 --- a/libgnn/test/accuracy-test.cpp +++ b/libgnn/test/accuracy-test.cpp @@ -33,23 +33,23 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - const std::vector* distributions = gnn->DoInference(); + galois::PointerWithSize distributions = gnn->DoInference(); // accuracy will be 0.2: everything chooses the first 1 as the entire row // is the same - float pred_accuracy = gnn->GetGlobalAccuracy(*distributions); + float pred_accuracy = gnn->GetGlobalAccuracy(distributions); GALOIS_LOG_VERBOSE("{}", pred_accuracy); GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.2)); // validation mode gnn->SetLayerPhases(galois::GNNPhase::kValidate); - const std::vector* dist2 = gnn->DoInference(); - pred_accuracy = gnn->GetGlobalAccuracy(*dist2); + galois::PointerWithSize dist2 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(dist2); GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); // test mode gnn->SetLayerPhases(galois::GNNPhase::kTest); - const std::vector* dist3 = gnn->DoInference(); - pred_accuracy = gnn->GetGlobalAccuracy(*dist3); + galois::PointerWithSize dist3 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(dist3); GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); // manufactured predictions to make sure it predicts things correctly based diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 432a546448..600ac42018 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -38,7 +38,7 @@ int main() { dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const std::vector& layer_0_forward_output = + galois::PointerWithSize layer_0_forward_output = layer_0->ForwardPhase(test_graph->GetLocalFeatures()); ////////////////////////////////////////////////////////////////////////////// @@ -97,20 +97,21 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - std::vector dummy_ones(test_graph->size() * 2, 1); + std::vector dummy_ones_v(test_graph->size() * 2, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); // backward pass checking // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything - std::vector* layer_0_backward_output = + galois::PointerWithSize layer_0_backward_output = layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// // sanity check layer 0 backward output; all 0 because layer 0 ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions - GALOIS_LOG_ASSERT(layer_0_backward_output->size() == test_graph->size() * 3); - for (size_t i = 0; i < layer_0_backward_output->size(); i++) { - GALOIS_LOG_ASSERT((*layer_0_backward_output)[i] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == test_graph->size() * 3); + for (size_t i = 0; i < 
layer_0_backward_output.size(); i++) { + GALOIS_LOG_ASSERT((layer_0_backward_output)[i] == 0); } ////////////////////////////////////////////////////////////////////////////// @@ -120,7 +121,7 @@ int main() { std::make_unique(1, *(test_graph.get()), dimension_0, l_config); layer_1->InitAllWeightsTo1(); - const std::vector& layer_1_forward_output = + galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph->GetLocalFeatures()); // same check for forward as before @@ -164,8 +165,8 @@ int main() { } // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones.assign(test_graph->size() * 2, 1); - std::vector* layer_1_backward_output = + dummy_ones_v.assign(test_graph->size() * 2, 1); + galois::PointerWithSize layer_1_backward_output = layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); for (size_t row = 0; row < test_graph->size(); row++) { @@ -193,8 +194,7 @@ int main() { // size 3 columns for (size_t c = 0; c < 3; c++) { - GALOIS_LOG_ASSERT((*layer_1_backward_output)[row * 3 + c] == - ground_truth); + GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth); } } diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 00825cf6f8..ae23fa4f23 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -17,7 +17,8 @@ int main() { galois::graphs::GNNGraph test_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - std::vector feats = test_graph.GetLocalFeatures(); + galois::PointerWithSize feats = + test_graph.GetLocalFeatures(); ////////////////////////////////////////////////////////////////////////////// // doubles as a test for reading as well GALOIS_LOG_ASSERT(7 == test_graph.size()); @@ -59,7 +60,7 @@ int main() { dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const std::vector& layer_0_forward_output = + const galois::PointerWithSize layer_0_forward_output = layer_0->ForwardPhase(test_graph.GetLocalFeatures()); ////////////////////////////////////////////////////////////////////////////// @@ -85,12 +86,13 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // dummy 1 matrix - std::vector dummy_ones(14, 1); + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); // backward pass checking // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything - std::vector* layer_0_backward_output = + galois::PointerWithSize layer_0_backward_output = layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// @@ -98,30 +100,30 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); - 
GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); - const std::vector layer_0_weight_gradients = + galois::PointerWithSize layer_0_weight_gradients = layer_0->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); @@ -141,7 +143,7 @@ int main() { std::make_unique(1, test_graph, dimension_0, dcon); layer_1->InitAllWeightsTo1(); - const std::vector& layer_1_forward_output = + galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); // same check as before for sanity purposes GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); @@ -161,36 +163,36 @@ int main() { GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones.assign(14, 1); - std::vector* layer_1_backward_output = + dummy_ones_v.assign(14, 1); + galois::PointerWithSize layer_1_backward_output = layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected ////////////////////////////////////////////////////////////////////////////// - GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); - 
GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); - const std::vector layer_1_weight_gradients = + galois::PointerWithSize layer_1_weight_gradients = layer_1->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); @@ -217,7 +219,7 @@ int main() { std::unique_ptr layer_2 = std::make_unique(1, test_graph, dimension_0, config); - const std::vector l2_fo = + galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); GALOIS_LOG_VERBOSE("{}", l2_fo[0]); @@ -235,31 +237,31 @@ int main() { GALOIS_LOG_VERBOSE("{}", l2_fo[12]); GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - std::vector* l2_bo = + galois::PointerWithSize l2_bo = layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - GALOIS_LOG_ASSERT(l2_bo->size() == 21); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); - 
GALOIS_LOG_VERBOSE("{}", (*l2_bo)[15]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 2486269ccd..da2a9e1be2 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -44,16 +44,16 @@ int main() { galois::StatTimer main_timer("Timer_0"); main_timer.start(); for (size_t epoch = 0; epoch < 20; epoch++) { - const std::vector* predictions = gnn->DoInference(); + galois::PointerWithSize predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", - gnn->GetGlobalAccuracy(*predictions), "\n"); + gnn->GetGlobalAccuracy(predictions), "\n"); } // check test accuracy gnn->SetLayerPhases(galois::GNNPhase::kTest); - const std::vector* predictions = gnn->DoInference(); - galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(*predictions), + galois::PointerWithSize predictions = gnn->DoInference(); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(predictions), "\n"); main_timer.stop(); } diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index d43e1b0e2e..e7232ca108 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -46,7 +46,7 @@ int main() { gnn->DoInference(); // check output for layers to make sure it's as expected - const std::vector& lf0_out = + galois::PointerWithSize lf0_out = gnn->GetIntermediateLayer(0)->GetForwardOutput(); GALOIS_LOG_ASSERT(lf0_out.size() == 28); for (size_t i = 0; i < 4; i++) { @@ -71,7 +71,7 @@ int main() { GALOIS_LOG_ASSERT(lf0_out[24 + i] == 15); } - const std::vector& lf1_out = + const galois::PointerWithSize lf1_out = gnn->GetIntermediateLayer(1)->GetForwardOutput(); GALOIS_LOG_ASSERT(lf1_out.size() == 49); for (size_t i = 0; i < 7; i++) { @@ -96,7 +96,7 @@ int main() { GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); } - const std::vector& fo_out = + const galois::PointerWithSize fo_out = gnn->GetOutputLayer()->GetForwardOutput(); GALOIS_LOG_ASSERT(fo_out.size() == 49); // since row all same, prob distribution across row should be same @@ -127,7 +127,7 @@ int main() { gnn->SetLayerPhases(galois::GNNPhase::kValidate); gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); - const std::vector& fo_out_val = + const galois::PointerWithSize fo_out_val = gnn->GetOutputLayer()->GetForwardOutput(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { @@ -150,7 +150,7 @@ int main() { 
gnn->SetLayerPhases(galois::GNNPhase::kTest); gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); - const std::vector& fo_out_test = + galois::PointerWithSize fo_out_test = gnn->GetOutputLayer()->GetForwardOutput(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp new file mode 100644 index 0000000000..d51a3bb54c --- /dev/null +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -0,0 +1,239 @@ +//! @file gpu-convlayer-test.cpp +//! Conv layer test with a test graph on gpus + +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector feats = test_graph.GetLocalFeatures(); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.allow_aggregate_after_update = false; + + // create the layer, no norm factor + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, + dimension_0, dcon); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + const std::vector& layer_0_forward_output = + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones(14, 1); + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + std::vector* layer_0_backward_output = + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); + 
GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); + + const std::vector layer_0_weight_gradients = + layer_0->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + layer_0.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + const std::vector& layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones.assign(14, 1); + std::vector* layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); + 
GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); + + const std::vector layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + layer_1.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerConfig config; + config.do_dropout = true; + config.do_activation = true; + config.do_normalization = true; + config.allow_aggregate_after_update = false; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + std::unique_ptr layer_2 = + std::make_unique(1, test_graph, + dimension_0, config); + const std::vector l2_fo = + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + std::vector* l2_bo = + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + GALOIS_LOG_ASSERT(l2_bo->size() == 21); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", 
(*l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + + return 0; +} diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index 5d9fa87728..f7baab24fd 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -38,7 +38,7 @@ int main() { // train mode auto output_layer = std::make_unique(3, test_graph, dimension_0); - const std::vector& prediction_distribution = + galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); output_layer->BackwardPhase(softmax_input, nullptr); @@ -60,7 +60,7 @@ int main() { // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); - const std::vector& pd2 = + galois::PointerWithSize pd2 = output_layer->ForwardPhase(softmax_input); output_layer->BackwardPhase(softmax_input, nullptr); // validate vertex is index 5 @@ -86,7 +86,7 @@ int main() { // test mode output_layer->SetLayerPhase(galois::GNNPhase::kTest); - const std::vector& pd3 = + galois::PointerWithSize pd3 = output_layer->ForwardPhase(softmax_input); output_layer->BackwardPhase(softmax_input, nullptr); // validate vertex is index 6 diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 561aa95370..3ea524e4a7 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -29,12 +29,13 @@ int main() { layer_0->InitAllWeightsTo1(); // backward pass checking; check the gradients out - std::vector dummy_ones(test_graph->size() * 2, 1); + std::vector dummy_ones_v(test_graph->size() * 2, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); // gradient verification; average // host 0 has 18, 1 has 21, 2 has 12, 3 has 0s; averaged to 12.75 - const std::vector& grads = + const galois::PointerWithSize& grads = layer_0->GetLayerWeightGradients(); for (size_t i = 0; i < 6; i++) { GALOIS_LOG_ASSERT(grads[i] == 12.75); From 5931edc464f9de1d72c9de5081a8c4a9fb93b4fc Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Nov 2020 19:03:35 -0600 Subject: [PATCH 399/660] Allocate GPU memory for parent GNNLayer Allocates memory for layer weights, gradients, forward output, and backward output of generic GNNLayers on the GPU. Layer specific matrices such as those on a GCN layer will need to be handled separately in later commits. 
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/graphs/GNNGraph.cuh | 6 ---- libgnn/include/galois/layers/GNNLayer.cuh | 26 +++++++++++++++ libgnn/include/galois/layers/GNNLayer.h | 11 +++++++ libgnn/src/layers/GNNLayer.cpp | 14 ++++++++ libgnn/src/layers/GNNLayer.cu | 40 +++++++++++++++++++++++ 6 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 libgnn/include/galois/layers/GNNLayer.cuh create mode 100644 libgnn/src/layers/GNNLayer.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 1ca05c8632..bf84358393 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -31,6 +31,7 @@ if (GALOIS_ENABLE_GPU) # create the galois_gnn_gpu library to get linked into galois_gnn set(gpusources src/graphs/GNNGraph.cu + src/layers/GNNLayer.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 9e047b6a0f..c44fba7b9a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -8,24 +8,18 @@ namespace graphs { //! Similar in nature to the CUDAContext class in existing D-IrGL class GNNGraphGPUAllocations { public: - // XXX getters for everything, the rest of the setters, etc. - //! CUDA frees all allocated memory (i.e. non-nullptr) ~GNNGraphGPUAllocations(); - //! Copies graph topology over to GPU; using ints because cuSparse lib //! expects ints for the CSR arrays void SetGraphTopology(const std::vector& edge_index, const std::vector& edge_dests); - //! Host side function that allocates memory for the features on the vertices //! and copies them over to the GPU. void SetFeatures(const std::vector& features, unsigned num_features); - //! Copy over ground truth for the graph to GPU void SetLabels(const std::vector& ground_truth); - private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh new file mode 100644 index 0000000000..3a89c97d61 --- /dev/null +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -0,0 +1,26 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { + +//! Holds pointers to GNN layer weights/gradient on GPU +class GNNLayerGPUAllocations { +public: + //! CUDA frees all allocated memory (i.e. non-nullptr) + ~GNNLayerGPUAllocations(); + //! Initializes forward and backward output matrices of this layer on GPU + void InitInOutMemory(size_t forward_size, size_t backward_size); + //! Initializes memory for weight and weight gradients on GPU + void InitWeightMemory(size_t num_weights); + //! Copy provided data in vector to GPU weights + void CopyToWeights(const std::vector& cpu_layer_weights); + +private: + size_t* num_weights_{nullptr}; + GNNFloat* forward_output_matrix_{nullptr}; + GNNFloat* backward_output_matrix_{nullptr}; + GNNFloat* layer_weights_{nullptr}; + GNNFloat* layer_weight_gradients_{nullptr}; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e738bdacca..9636d4f8d6 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -5,6 +5,10 @@ #include "galois/graphs/GNNGraph.h" #include "galois/layers/GluonGradientInterface.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/layers/GNNLayer.cuh" +#endif + namespace galois { //! Supported layer types in the GNN @@ -203,6 +207,13 @@ class GNNLayer { //! 
Synchronize weight gradients with a summation, then locally divide all //! weights to get an average void WeightGradientSyncAverage(); + +#ifdef GALOIS_ENABLE_GPU + //! Object that holds all GPU allocated pointers to memory related to layers + GNNLayerGPUAllocations gpu_memory_; + //! Copies over layer weights to GPU + void CopyLayerWeightsToGPU(); +#endif }; } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index a0ead51e10..a42d593a22 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -28,6 +28,9 @@ galois::GNNLayer::GNNLayer(size_t layer_num, *gradient_sync_interface_, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); +#ifdef GALOIS_ENABLE_GPU + gpu_memory_.InitWeightMemory(num_weight_elements); +#endif } size_t num_output_elements = @@ -35,6 +38,11 @@ galois::GNNLayer::GNNLayer(size_t layer_num, forward_output_matrix_.resize(num_output_elements, 0); backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); +#ifdef GALOIS_ENABLE_GPU + gpu_memory_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); +#endif } void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { @@ -159,3 +167,9 @@ void galois::GNNLayer::WeightGradientSyncAverage() { galois::loopname("WeightGradientSyncAverageDivide")); } } + +#ifdef GALOIS_ENABLE_GPU +void galois::GNNLayer::CopyLayerWeightsToGPU() { + gpu_memory_.CopyToWeights(layer_weights_); +} +#endif diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu new file mode 100644 index 0000000000..424df92e26 --- /dev/null +++ b/libgnn/src/layers/GNNLayer.cu @@ -0,0 +1,40 @@ +#include "galois/CUDAUtil.h" +#include "galois/layers/GNNLayer.cuh" + +galois::GNNLayerGPUAllocations::~GNNLayerGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing GPU layer allocations"); + CUDA_FREE(num_weights_); + CUDA_FREE(forward_output_matrix_); + CUDA_FREE(backward_output_matrix_); + CUDA_FREE(layer_weights_); + CUDA_FREE(layer_weight_gradients_); +} + +void galois::GNNLayerGPUAllocations::InitInOutMemory(size_t forward_size, + size_t backward_size) { + CUDA_CHECK(cudaMalloc((void**)(&forward_output_matrix_), + forward_size * sizeof(GNNFloat))); + CUDA_CHECK(cudaMalloc((void**)(&backward_output_matrix_), + backward_size * sizeof(GNNFloat))); +} + +void galois::GNNLayerGPUAllocations::InitWeightMemory(size_t num_weights) { + // num weights + CUDA_CHECK(cudaMalloc((void**)(&num_weights_), sizeof(size_t))); + CUDA_CHECK(cudaMemcpy(num_weights_, &num_weights, sizeof(size_t), + cudaMemcpyHostToDevice)); + // memory + CUDA_CHECK( + cudaMalloc((void**)(&layer_weights_), num_weights * sizeof(GNNFloat))); + CUDA_CHECK(cudaMalloc((void**)(&layer_weight_gradients_), + num_weights * sizeof(GNNFloat))); +} + +void galois::GNNLayerGPUAllocations::CopyToWeights( + const std::vector& cpu_layer_weights) { + CUDA_CHECK(cudaMemcpy(layer_weights_, cpu_layer_weights.data(), + cpu_layer_weights.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + +// TODO copy from gpu function as well just in case I need to check From 483fda6703859a96a96da30612949fcb97d1e1ad Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Nov 2020 19:35:39 -0600 Subject: [PATCH 400/660] GCN layer: allocate GPU memory for matrices Allocates memory on the GPU for the intermediate matrices used in the GCN layer of the graph neural network. 
--- libgnn/CMakeLists.txt | 1 + .../galois/layers/GraphConvolutionalLayer.cuh | 19 +++++++++++++++++++ .../galois/layers/GraphConvolutionalLayer.h | 8 ++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 +++ libgnn/src/layers/GraphConvolutionalLayer.cu | 19 +++++++++++++++++++ 5 files changed, 50 insertions(+) create mode 100644 libgnn/include/galois/layers/GraphConvolutionalLayer.cuh create mode 100644 libgnn/src/layers/GraphConvolutionalLayer.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index bf84358393..f556ec6ca4 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -32,6 +32,7 @@ if (GALOIS_ENABLE_GPU) set(gpusources src/graphs/GNNGraph.cu src/layers/GNNLayer.cu + src/layers/GraphConvolutionalLayer.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh new file mode 100644 index 0000000000..6b567eab2e --- /dev/null +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -0,0 +1,19 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { + +//! Holds pointers for GPU memory for GCN layer +class GCNGPUAllocations { +public: + // free memory + ~GCNGPUAllocations(); + // allocate the 3 temp arrays + void Allocate(size_t input_elements, size_t output_elements); +private: + GNNFloat* in_temp_1_{nullptr}; + GNNFloat* in_temp_2_{nullptr}; + GNNFloat* out_temp_{nullptr}; +}; + +} diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 196fa752c8..19c4e6c68c 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -1,6 +1,10 @@ #pragma once #include "galois/layers/GNNLayer.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/layers/GraphConvolutionalLayer.cuh" +#endif + namespace galois { class GraphConvolutionalLayer : public GNNLayer { @@ -55,6 +59,10 @@ class GraphConvolutionalLayer : public GNNLayer { void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); //! 
Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + +#ifdef GALOIS_ENABLE_GPU + GCNGPUAllocations gpu_memory_; +#endif }; } // namespace galois diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index e2e80ce8b1..c10c59c383 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -19,6 +19,9 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; +#ifdef GALOIS_ENABLE_GPU + gpu_memory_.Allocate(num_input_elements, num_output_elements); +#endif } const galois::PointerWithSize diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu new file mode 100644 index 0000000000..20e96d9777 --- /dev/null +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -0,0 +1,19 @@ +#include "galois/CUDAUtil.h" +#include "galois/layers/GraphConvolutionalLayer.cuh" + +galois::GCNGPUAllocations::~GCNGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing GCN layer allocations"); + CUDA_FREE(in_temp_1_); + CUDA_FREE(in_temp_2_); + CUDA_FREE(out_temp_); +} + +void galois::GCNGPUAllocations::Allocate(size_t input_elements, + size_t output_elements) { + CUDA_CHECK( + cudaMalloc((void**)(&in_temp_1_), input_elements * sizeof(GNNFloat))); + CUDA_CHECK( + cudaMalloc((void**)(&in_temp_2_), input_elements * sizeof(GNNFloat))); + CUDA_CHECK( + cudaMalloc((void**)(&out_temp_), output_elements * sizeof(GNNFloat))); +} From deb50bceb176ba4bc54883ebc5f9ba36053f6563 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:02:35 -0600 Subject: [PATCH 401/660] Disable majority GNN tests for GPU build Turn off a majority of the GNN tests for the GPU build because the data is on the GPU so doing a check of the data would involve copying it back to the CPU: this will be done later. This commit is unlikely to build as it does not include a few changes coming in later commits that make things work. 
--- libgnn/test/CMakeLists.txt | 76 +++--- libgnn/test/gpu-convlayer-test.cpp | 405 +++++++++++++++-------------- 2 files changed, 247 insertions(+), 234 deletions(-) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 69ef29b43f..8bec96c4d6 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,40 +2,46 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) -add_executable(convlayer-test convlayer-test.cpp) -target_link_libraries(convlayer-test galois_gnn) -add_test(NAME convlayer-test COMMAND convlayer-test) - -add_executable(softmaxlayer-test softmaxlayer-test.cpp) -target_link_libraries(softmaxlayer-test galois_gnn) -add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) - -add_executable(gnnconstruct-test gnnconstruct-test.cpp) -target_link_libraries(gnnconstruct-test galois_gnn) -add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) - -add_executable(gnnfb-test gnnfb-test.cpp) -target_link_libraries(gnnfb-test galois_gnn) -add_test(NAME gnnfb-test COMMAND gnnfb-test) - -add_executable(adam-test adam-test.cpp) -target_link_libraries(adam-test galois_gnn) -add_test(NAME adam-test COMMAND adam-test) - -add_executable(accuracy-test accuracy-test.cpp) -target_link_libraries(accuracy-test galois_gnn) -add_test(NAME accuracy-test COMMAND accuracy-test) - -add_executable(epoch-test epoch-test.cpp) -target_link_libraries(epoch-test galois_gnn) -add_test(NAME epoch-test COMMAND epoch-test) - -# TODO figure out how to make this test run in parallel -add_executable(aggregate-sync-test aggregate-sync-test.cpp) -target_link_libraries(aggregate-sync-test galois_gnn) -#add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) - -add_executable(weight-sync-test weight-sync-test.cpp) -target_link_libraries(weight-sync-test galois_gnn) +if (NOT GALOIS_ENABLE_GPU) + add_executable(convlayer-test convlayer-test.cpp) + target_link_libraries(convlayer-test galois_gnn) + add_test(NAME convlayer-test COMMAND convlayer-test) + + add_executable(softmaxlayer-test softmaxlayer-test.cpp) + target_link_libraries(softmaxlayer-test galois_gnn) + add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) + + add_executable(gnnconstruct-test gnnconstruct-test.cpp) + target_link_libraries(gnnconstruct-test galois_gnn) + add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) + + add_executable(gnnfb-test gnnfb-test.cpp) + target_link_libraries(gnnfb-test galois_gnn) + add_test(NAME gnnfb-test COMMAND gnnfb-test) + + add_executable(adam-test adam-test.cpp) + target_link_libraries(adam-test galois_gnn) + add_test(NAME adam-test COMMAND adam-test) + + add_executable(accuracy-test accuracy-test.cpp) + target_link_libraries(accuracy-test galois_gnn) + add_test(NAME accuracy-test COMMAND accuracy-test) + + add_executable(epoch-test epoch-test.cpp) + target_link_libraries(epoch-test galois_gnn) + add_test(NAME epoch-test COMMAND epoch-test) + + # TODO figure out how to make this test run in parallel + add_executable(aggregate-sync-test aggregate-sync-test.cpp) + target_link_libraries(aggregate-sync-test galois_gnn) + #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) + + add_executable(weight-sync-test weight-sync-test.cpp) + target_link_libraries(weight-sync-test galois_gnn) +else() + add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) + 
target_link_libraries(gpu-convlayer-test galois_gnn) + add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) +endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index d51a3bb54c..0123a35b17 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -17,7 +17,13 @@ int main() { galois::graphs::GNNGraph test_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - std::vector feats = test_graph.GetLocalFeatures(); + galois::PointerWithSize feats = + test_graph.GetLocalFeatures(); + ////////////////////////////////////////////////////////////////////////////// + // doubles as a test for reading as well + GALOIS_LOG_ASSERT(7 == test_graph.size()); + GALOIS_LOG_ASSERT(21 == feats.size()); + ////////////////////////////////////////////////////////////////////////////// galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; @@ -33,207 +39,208 @@ int main() { dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const std::vector& layer_0_forward_output = + const galois::PointerWithSize layer_0_forward_output = layer_0->ForwardPhase(test_graph.GetLocalFeatures()); - ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 output - ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - // 7 x 2 + //////////////////////////////////////////////////////////////////////////////// + //// sanity check layer 0 output + //////////////////////////////////////////////////////////////////////////////// + //// since norm factors aren't invovled it is possible to do full assertions + //// 7 x 2 GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); - GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); - GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); - GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); - GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); - GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); - GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); - GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); - GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); - GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); - GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); - GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); - GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); - GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); - GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); - ////////////////////////////////////////////////////////////////////////////// - - // dummy 1 matrix - std::vector dummy_ones(14, 1); - - // backward pass checking - // layer 0 means that an empty weight matrix is returned since there is no - // point passing back anything - std::vector* layer_0_backward_output = - layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - - ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 backward output; all 0 because layer 0 - ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); - 
GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); - - const std::vector layer_0_weight_gradients = - layer_0->GetLayerWeightGradients(); - // make sure they are sane - GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); - - layer_0.reset(); - - ////////////////////////////////////////////////////////////////////////////// - - // create layer 1 for testing backward prop actually giving weights back - - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, - dimension_0, dcon); - layer_1->InitAllWeightsTo1(); - const std::vector& layer_1_forward_output = - layer_1->ForwardPhase(test_graph.GetLocalFeatures()); - // same check as before for sanity purposes - GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); - GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); - GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); - GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); - GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); - GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); - GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); - GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); - GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); - GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); - GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); - GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); - GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); - GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); - GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); - - // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones.assign(14, 1); - std::vector* layer_1_backward_output = - layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - ////////////////////////////////////////////////////////////////////////////// - // check that multiplies go as expected - ////////////////////////////////////////////////////////////////////////////// - GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); - 
GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); - - const std::vector layer_1_weight_gradients = - layer_1->GetLayerWeightGradients(); - // make sure they are sane - GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); - - layer_1.reset(); - - ////////////////////////////////////////////////////////////////////////////// - - galois::GNNLayerConfig config; - config.do_dropout = true; - config.do_activation = true; - config.do_normalization = true; - config.allow_aggregate_after_update = false; - - // finally, just make sure dropout and activation run without crashes - // (verification requires floating point accuracy or setting a seed which I - // don't have time for at the moment - // TODO in future maybe add better unit test for this - std::unique_ptr layer_2 = - std::make_unique(1, test_graph, - dimension_0, config); - const std::vector l2_fo = - layer_2->ForwardPhase(test_graph.GetLocalFeatures()); - GALOIS_LOG_ASSERT(l2_fo.size() == 14); - GALOIS_LOG_VERBOSE("{}", l2_fo[0]); - GALOIS_LOG_VERBOSE("{}", l2_fo[1]); - GALOIS_LOG_VERBOSE("{}", l2_fo[2]); - GALOIS_LOG_VERBOSE("{}", l2_fo[3]); - GALOIS_LOG_VERBOSE("{}", l2_fo[4]); - GALOIS_LOG_VERBOSE("{}", l2_fo[5]); - GALOIS_LOG_VERBOSE("{}", l2_fo[6]); - GALOIS_LOG_VERBOSE("{}", l2_fo[7]); - GALOIS_LOG_VERBOSE("{}", l2_fo[8]); - GALOIS_LOG_VERBOSE("{}", l2_fo[9]); - GALOIS_LOG_VERBOSE("{}", l2_fo[10]); - GALOIS_LOG_VERBOSE("{}", l2_fo[11]); - GALOIS_LOG_VERBOSE("{}", l2_fo[12]); - GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - - std::vector* l2_bo = - layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - - GALOIS_LOG_ASSERT(l2_bo->size() == 21); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[15]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); - 
GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + //GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + //GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + //GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + //GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + //GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + //GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + //GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + //GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + //GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + //GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + //GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + //GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + //GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + //GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + //////////////////////////////////////////////////////////////////////////////// + + //// dummy 1 matrix + //std::vector dummy_ones_v(14, 1); + //galois::PointerWithSize dummy_ones(dummy_ones_v); + + //// backward pass checking + //// layer 0 means that an empty weight matrix is returned since there is no + //// point passing back anything + //galois::PointerWithSize layer_0_backward_output = + // layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + //////////////////////////////////////////////////////////////////////////////// + //// sanity check layer 0 backward output; all 0 because layer 0 + //////////////////////////////////////////////////////////////////////////////// + //// since norm factors aren't invovled it is possible to do full assertions + //// 7 x 3 + //GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + //galois::PointerWithSize layer_0_weight_gradients = + // layer_0->GetLayerWeightGradients(); + //// make sure they are sane + //GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + //layer_0.reset(); + + 
//////////////////////////////////////////////////////////////////////////////// + + //// create layer 1 for testing backward prop actually giving weights back + + //std::unique_ptr layer_1 = + // std::make_unique(1, test_graph, + // dimension_0, dcon); + //layer_1->InitAllWeightsTo1(); + //galois::PointerWithSize layer_1_forward_output = + // layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + //// same check as before for sanity purposes + //GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + //GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + //GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + //GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + //GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + //GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + //GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + //GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + //GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + //GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + //GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + //GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + //GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + //GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + //GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + //// since layer isn't 0 anymore, backward phase will actually return something + //dummy_ones_v.assign(14, 1); + //galois::PointerWithSize layer_1_backward_output = + // layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + //////////////////////////////////////////////////////////////////////////////// + //// check that multiplies go as expected + //////////////////////////////////////////////////////////////////////////////// + //GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); + + //galois::PointerWithSize layer_1_weight_gradients = + // layer_1->GetLayerWeightGradients(); + //// make sure they are sane + //GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + //layer_1.reset(); + + 
//////////////////////////////////////////////////////////////////////////////// + + //galois::GNNLayerConfig config; + //config.do_dropout = true; + //config.do_activation = true; + //config.do_normalization = true; + //config.allow_aggregate_after_update = false; + + //// finally, just make sure dropout and activation run without crashes + //// (verification requires floating point accuracy or setting a seed which I + //// don't have time for at the moment + //// TODO in future maybe add better unit test for this + //std::unique_ptr layer_2 = + // std::make_unique(1, test_graph, + // dimension_0, config); + //galois::PointerWithSize l2_fo = + // layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + //GALOIS_LOG_ASSERT(l2_fo.size() == 14); + //GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + //galois::PointerWithSize l2_bo = + // layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + //GALOIS_LOG_ASSERT(l2_bo.size() == 21); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } From 606750b85e131b08da97ff60f9ea621ee041a45e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:06:42 -0600 Subject: [PATCH 402/660] GPU arch related defs, CUDA_TEST Adds compile time constants for various GPU things like warp size, block size, etc. Also adds CUDA_TEST which is a macro for checking if a CUDA call returned successfully. 
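As a quick illustration of how these checking macros are meant to be used (a minimal sketch only; the kernel, buffer, and sizes below are hypothetical and not code from this patch): CUDA_CHECK guards a single CUDA API call, while CUDA_TEST synchronizes and reports any error a preceding kernel launch left behind.

    #include "galois/CUDAUtil.h"

    // hypothetical kernel: zero-fill a device buffer
    __global__ void ZeroFillKernel(galois::GNNFloat* data, size_t n) {
      size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        data[i] = 0.0f;
      }
    }

    // hypothetical helper showing the two macros in context
    void ZeroFillExample(size_t n) {
      galois::GNNFloat* buffer;
      // CUDA_CHECK asserts the API call returned cudaSuccess
      CUDA_CHECK(cudaMalloc((void**)(&buffer), n * sizeof(galois::GNNFloat)));
      ZeroFillKernel<<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(buffer, n);
      // CUDA_TEST synchronizes, then checks cudaGetLastError for the launch
      CUDA_TEST("zero fill kernel failure");
      CUDA_FREE(buffer);
    }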
--- libgnn/include/galois/CUDAUtil.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index f8d7a03b80..d479efe64d 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -5,6 +5,14 @@ #include #include "galois/Logging.h" +// TODO check these +#define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define BLOCK_SIZE 256 +#define WARP_SIZE 32 +#define MAX_NUM_CLASSES 128 +#define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) + #define CUDA_CHECK(condition) \ do { \ cudaError_t error = condition; \ @@ -22,4 +30,15 @@ } \ } while (0) +#define CUDA_TEST(msg) \ + do { \ + cudaError_t e; \ + cudaDeviceSynchronize(); \ + if (cudaSuccess != (e = cudaGetLastError())) { \ + GALOIS_LOG_ERROR("{}: {}", msg, e); \ + GALOIS_LOG_ERROR("{}", cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #endif From 4d218ccc2f769c4c369e42d6fec185c9c5ba100e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:18:25 -0600 Subject: [PATCH 403/660] GPU structs: getter functions Adds getter functions to the memory objects for layers and the graph. The GCN layer also declares a AggregateAllGPU function for a later commit to use + to do aggregation on the GPU. --- libgnn/include/galois/graphs/GNNGraph.cuh | 8 ++++++++ libgnn/include/galois/layers/GNNLayer.cuh | 5 +++++ .../galois/layers/GraphConvolutionalLayer.cuh | 12 +++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index c44fba7b9a..33093d2ebc 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -20,6 +20,14 @@ public: unsigned num_features); //! Copy over ground truth for the graph to GPU void SetLabels(const std::vector& ground_truth); + + GNNFeature* feature_vector() { return feature_vector_; }; + const GNNFeature* feature_vector() const { return feature_vector_; }; + int* edge_index() { return edge_index_; } + const int* edge_index() const { return edge_index_; } + int* edge_destinations() { return edge_destinations_; } + const int* edge_destinations() const { return edge_destinations_; } + private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 3a89c97d61..81fa9e2026 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -15,6 +15,11 @@ public: //! 
Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); + GNNFloat* forward_output() { return forward_output_matrix_; } + GNNFloat* backward_output() { return backward_output_matrix_; } + GNNFloat* layer_weights() { return layer_weights_; } + GNNFloat* layer_weight_gradients() { return layer_weight_gradients_; } + private: size_t* num_weights_{nullptr}; GNNFloat* forward_output_matrix_{nullptr}; diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index 6b567eab2e..993b6f39cb 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -1,5 +1,6 @@ #pragma once #include "galois/GNNTypes.h" +#include "galois/graphs/GNNGraph.cuh" namespace galois { @@ -10,10 +11,19 @@ public: ~GCNGPUAllocations(); // allocate the 3 temp arrays void Allocate(size_t input_elements, size_t output_elements); + GNNFloat* in_temp_1() { return in_temp_1_; } + GNNFloat* in_temp_2() { return in_temp_2_; } + GNNFloat* out_temp() { return out_temp_; } + + void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, + size_t num_nodes, size_t column_length, + const GNNFloat* node_embeddings, + GNNFloat* aggregate_output); + private: GNNFloat* in_temp_1_{nullptr}; GNNFloat* in_temp_2_{nullptr}; GNNFloat* out_temp_{nullptr}; }; -} +} // namespace galois From 6629963b45b5879935c3aa596e414985c5078a08 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:22:37 -0600 Subject: [PATCH 404/660] GNNGraph: GPU object access + return GPU features Return GPU features if GPU build is on + adds function to get the GPU pointer object. --- libgnn/include/galois/graphs/GNNGraph.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2b55d17b7a..5383b325d3 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -102,7 +102,13 @@ class GNNGraph { //! Return matrix of the local node features const PointerWithSize GetLocalFeatures() { +#ifndef GALOIS_ENABLE_GPU return PointerWithSize(local_node_features_); +#else + // TODO remove reliance on local_node_features + return PointerWithSize(gpu_memory_.feature_vector(), + local_node_features_.size()); +#endif } //! Given an LID and the current phase of GNN computation, determine if the @@ -121,6 +127,9 @@ class GNNGraph { void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size) const; +#ifdef GALOIS_ENABLE_GPU + const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } +#endif private: //! Directory for input data const std::string input_directory_; From 15de1cf298c5957e0b043b2435f159bb0e7789ea Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:33:46 -0600 Subject: [PATCH 405/660] Use PointerWithSize for functions in layers Since the code now needs to run with both CPUs and GPUs, the "orchestration" code cannot use vectors anymore. This commit replaces them with PointerWithSize which can be used by both CPUs and GPUs. This commit adds initialization code for these objects depending on the build. This commit also begins to split the calls in CPU and GPU variants: the orchestration code remains the same, but depending on the build, the code will either call a CPU version or a GPU version. 
This split is done for dropout and aggregation so far (though this commit does not include the implementation of the GPU code; just the call). --- libgnn/include/galois/layers/GNNLayer.h | 30 +++++++--- .../galois/layers/GraphConvolutionalLayer.h | 14 ++++- libgnn/src/layers/GNNLayer.cpp | 44 ++++++++++++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 55 +++++++++++++++---- 4 files changed, 115 insertions(+), 28 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 9636d4f8d6..b1c8ae55a5 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -144,22 +144,32 @@ class GNNLayer { GNNLayerDimensions layer_dimensions_; //! Config object for certain parameters for layer GNNLayerConfig config_; + //! Weights used by this layer. Dimensions: input columns by output columns std::vector layer_weights_; //! Gradients used to update the weights of this layer std::vector layer_weight_gradients_; - //! Wrapper over gradient matrix to make it compatible with Gluon - std::unique_ptr gradient_sync_interface_; - //! Synchronization substrate for the weight gradients - std::unique_ptr> - gradient_sync_substrate_; - // There is a forward and a backward as their sizes will differ and we only // want to allocate memory once to avoid runtime memory allocation. //! The output of the forward phase for this layer. std::vector forward_output_matrix_; //! The output of the backward phase for this layer. std::vector backward_output_matrix_; + + // These are wrapper around the pointer for the data associated with + // any GNN layer: takes a CPU or GPU pointer depending on configuration + // Needed to allow both CPU/GPU runs with same code + PointerWithSize p_layer_weights_; + PointerWithSize p_layer_weight_gradients_; + PointerWithSize p_forward_output_matrix_; + PointerWithSize p_backward_output_matrix_; + + //! Wrapper over gradient matrix to make it compatible with Gluon + std::unique_ptr gradient_sync_interface_; + //! Synchronization substrate for the weight gradients + std::unique_ptr> + gradient_sync_substrate_; + //! RNG for matrix initialization PerThreadRNG random_init_rng_{-5.0, 5.0}; //! RNG for dropout @@ -188,11 +198,15 @@ class GNNLayer { //! Randomly init a float vector using the class's random init RNG void RandomInitVector(std::vector* vector_to_init); + //! CPU variant of dropout + void DoDropoutCPU(const PointerWithSize input_to_drop, + PointerWithSize* output_matrix); + //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate void DoDropout(const PointerWithSize input_to_drop, - std::vector* output_matrix); + PointerWithSize* output_matrix); //! Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); @@ -210,7 +224,7 @@ class GNNLayer { #ifdef GALOIS_ENABLE_GPU //! Object that holds all GPU allocated pointers to memory related to layers - GNNLayerGPUAllocations gpu_memory_; + GNNLayerGPUAllocations gpu_object_; //! 
Copies over layer weights to GPU void CopyLayerWeightsToGPU(); #endif diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 19c4e6c68c..c677389df7 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -39,6 +39,12 @@ class GraphConvolutionalLayer : public GNNLayer { // Temporary matrix the size of the output of the forward pass; used if // an intermediate op occurs before writing to the final output matrix std::vector out_temp_; + + // Pointer with size versions + PointerWithSize p_in_temp_1_; + PointerWithSize p_in_temp_2_; + PointerWithSize p_out_temp_; + // Each thread has a vector of size # input columns or # output columns for // storing intermediate results during aggregation. // The one used depeneds on if aggregation occurs before or after the mxm. @@ -47,6 +53,12 @@ class GraphConvolutionalLayer : public GNNLayer { galois::substrate::PerThreadStorage> output_column_intermediates_; + //! CPU aggregation + void AggregateAllCPU( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts); + //! Performs aggregation for all nodes of the graph given the length of the //! vector to aggregate, the features themselves, an output array, and per //! thread storage for the intermediate scaling via norm factor @@ -61,7 +73,7 @@ class GraphConvolutionalLayer : public GNNLayer { void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); #ifdef GALOIS_ENABLE_GPU - GCNGPUAllocations gpu_memory_; + GCNGPUAllocations gpu_object_; #endif }; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index a42d593a22..8bf9c42dfd 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -29,7 +29,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); #ifdef GALOIS_ENABLE_GPU - gpu_memory_.InitWeightMemory(num_weight_elements); + gpu_object_.InitWeightMemory(num_weight_elements); #endif } @@ -39,10 +39,31 @@ galois::GNNLayer::GNNLayer(size_t layer_num, backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); #ifdef GALOIS_ENABLE_GPU - gpu_memory_.InitInOutMemory(num_output_elements, + gpu_object_.InitInOutMemory(num_output_elements, layer_dimensions_.input_rows * layer_dimensions_.input_columns); #endif + + // initialize the PointerWithSize wrappers +#ifndef GALOIS_ENABLE_GPU + p_layer_weights_ = PointerWithSize(layer_weights_); + p_layer_weight_gradients_ = + PointerWithSize(layer_weight_gradients_); + p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); + p_backward_output_matrix_ = + PointerWithSize(backward_output_matrix_); +#else + p_layer_weights_ = PointerWithSize(gpu_object_.layer_weights(), + layer_weights_.size()); + p_layer_weight_gradients_ = PointerWithSize( + gpu_object_.layer_weight_gradients(), layer_weight_gradients_.size()); + p_forward_output_matrix_ = PointerWithSize( + gpu_object_.forward_output(), forward_output_matrix_.size()); + p_backward_output_matrix_ = PointerWithSize( + gpu_object_.backward_output(), backward_output_matrix_.size()); + // TODO can clear the cpu side vectors/don't use .size() since optimally they + // aren't initialized +#endif } void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { @@ -67,11 +88,9 @@ 
void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { galois::loopname("RandomInitVector")); } -// XXX Something is wrong with dropout; accuracy suffers, figure out what -// it is -void galois::GNNLayer::DoDropout( +void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, - std::vector* output_matrix) { + PointerWithSize* output_matrix) { size_t num_elements = output_matrix->size(); assert(num_elements == dropout_mask_.size()); assert(num_elements == input_to_dropout.size()); @@ -96,6 +115,17 @@ void galois::GNNLayer::DoDropout( galois::loopname("LayerDropout")); } +void galois::GNNLayer::DoDropout( + const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + //#ifdef GALOIS_ENABLE_GPU + // // XXX + // DoDropoutGPU(); + //#else + DoDropoutCPU(input_to_dropout, output_matrix); + //#endif +} + void galois::GNNLayer::DoDropoutDerivative() { assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); @@ -170,6 +200,6 @@ void galois::GNNLayer::WeightGradientSyncAverage() { #ifdef GALOIS_ENABLE_GPU void galois::GNNLayer::CopyLayerWeightsToGPU() { - gpu_memory_.CopyToWeights(layer_weights_); + gpu_object_.CopyToWeights(layer_weights_); } #endif diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index c10c59c383..af2925facc 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -20,39 +20,57 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU - gpu_memory_.Allocate(num_input_elements, num_output_elements); + gpu_object_.Allocate(num_input_elements, num_output_elements); #endif + + // init pointers with size +#ifndef GALOIS_ENABLE_GPU + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#else + p_in_temp_1_ = + PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + p_in_temp_2_ = + PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + p_out_temp_ = + PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); +#endif + GALOIS_LOG_VERBOSE("Conv layer initialized"); } const galois::PointerWithSize galois::GraphConvolutionalLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { + GALOIS_LOG_VERBOSE("Calling forward phase"); assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(in_temp_1_.size() == input_embeddings.size()); - assert(in_temp_2_.size() == input_embeddings.size()); - assert(forward_output_matrix_.size() == + assert(p_in_temp_1_.size() == input_embeddings.size()); + assert(p_in_temp_2_.size() == input_embeddings.size()); + assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); // first, dropout if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(input_embeddings, &in_temp_1_); - input_data = in_temp_1_.data(); + galois::PointerWithSize drop_output(in_temp_1_); + DoDropout(input_embeddings, &drop_output); + input_data = drop_output.data(); } // flip aggregate/update if dimensions favor it (do less work) if (!config_.allow_aggregate_after_update || layer_dimensions_.input_columns <= 
layer_dimensions_.output_columns) { // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), - &input_column_intermediates_); - UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); + AggregateAll(layer_dimensions_.input_columns, input_data, + p_in_temp_2_.data(), &input_column_intermediates_); + UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); } else { // update to aggregate - UpdateEmbeddings(input_data, out_temp_.data()); - AggregateAll(layer_dimensions_.output_columns, out_temp_.data(), - forward_output_matrix_.data(), &output_column_intermediates_); + UpdateEmbeddings(input_data, p_out_temp_.data()); + AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), + p_forward_output_matrix_.data(), + &output_column_intermediates_); } // TODO synchronization of aggregation functions @@ -129,6 +147,19 @@ galois::GraphConvolutionalLayer::BackwardPhase( } void galois::GraphConvolutionalLayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts) { +#ifndef GALOIS_ENABLE_GPU + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); +#else + gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), + column_length, node_embeddings, aggregate_output); +#endif +} + +void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts) { From 336b6311ec9e70e0178932701efac10ed531ee21 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:40:42 -0600 Subject: [PATCH 406/660] GCN aggregation code on GPU This commit adds the GCN aggregation code on the GPU: each warp gets a node, and the warp splits the summation of that node's feature vector among its threads. Therefore, the implementation works well when the feature vector is large enough to take advantage of the parallelism. This code was taken from the old GPU code, so it may not be fully optimized yet. cuSparse may perform better as well, but at the moment I just want to get something working.
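To make the semantics of the kernel below easier to check, here is a minimal sequential reference of what the aggregation computes. It is only a sketch: it assumes the same CSR-style edge_index/edge_destination arrays and row-major feature buffers the kernel reads, and, like this GPU version, it applies no norm-factor scaling.

    // Sequential reference (sketch only): for every source node, sum the
    // feature vectors of its outgoing neighbors into its output row. The GPU
    // kernel assigns one warp per src and strides the innermost loop across
    // the warp's lanes instead of running it on a single thread.
    for (unsigned src = 0; src < num_nodes; src++) {
      for (int e = edge_index[src]; e < edge_index[src + 1]; e++) {
        int dst = edge_destination[e];
        for (size_t i = 0; i < column_length; i++) {
          aggregate_output[src * column_length + i] +=
              node_embeddings[dst * column_length + i];
        }
      }
    }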
--- libgnn/src/layers/GraphConvolutionalLayer.cu | 62 ++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 20e96d9777..e5a34a3c15 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -17,3 +17,65 @@ void galois::GCNGPUAllocations::Allocate(size_t input_elements, CUDA_CHECK( cudaMalloc((void**)(&out_temp_), output_elements * sizeof(GNNFloat))); } + +namespace { +// GPU side aggregation call: no matrix multiply, just regular dst accesses +__global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, + const int* edge_index, + const int* edge_destination, + const galois::GNNFloat* node_embeddings, + galois::GNNFloat* aggregate_output) { + const unsigned thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // each warp gets a source: this var holds the first/last edge worked on by + // that warp + __shared__ int edge_begin_end[BLOCK_SIZE / WARP_SIZE][2]; + + // each warp works on a source: threads in warp split the feature + for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + if (thread_lane < 2) { + edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; + } + // essentially what this is doing is making 2 of the threads set edge + // begin/end; all threads wait for sync + __syncthreads(); + + const int row_begin = edge_begin_end[warp_lane][0]; + const int row_end = edge_begin_end[warp_lane][1]; + unsigned base_src_index = src * column_length; + + for (int offset = row_begin; offset < row_end; offset++) { + int dst = edge_destination[offset]; + unsigned base_dst_index = dst * column_length; + + // NOTE: this is where warp diverges + // the feature aggregation is split among thread in a warp + for (int i = 0; i < column_length; i += WARP_SIZE) { + if ((thread_lane + i) < column_length) { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i]; + } + } + } + } +} + +} // namespace + +void galois::GCNGPUAllocations::AggregateAllGPU( + const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output) { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), node_embeddings, aggregate_output); + CUDA_TEST("GPU aggregate all failure"); +} From f46332a58ce93c2692e5427e2ce448e07dbcc47f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 12:55:41 -0600 Subject: [PATCH 407/660] Adds CuBLAS SGEMM function This commit adds a function to call CuBLAS's SGEMM for doing matrix multiplies on the GPU using row-major matrices. Note the way matrices are passed into the function: B * A instead of A * B. The idea is that the CuBLAS function assumes column-major, so using row-major matrices makes it so that it's a transpose matrix. (BA)^T = C^T, and C^T in column-major form is a row-major C (exactly what we want). 
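Spelling out the identity the message relies on (a worked restatement, not new code): the goal is C = A * B with row-major A (m x k) and B (k x n). Because cuBLAS assumes column-major storage, it reads those same buffers as A^T and B^T, so calling SGEMM with the operands swapped makes it compute

    B^T * A^T = (A * B)^T = C^T   (column-major),

and a column-major C^T occupies memory exactly like a row-major C. The caller therefore gets the desired product without any explicit transposes or extra copies.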
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/CUDAUtil.h | 14 ++++++++++++-- libgnn/include/galois/GNNMath.cuh | 22 ++++++++++++++++++++++ libgnn/src/GNNMath.cu | 26 ++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 libgnn/include/galois/GNNMath.cuh create mode 100644 libgnn/src/GNNMath.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index f556ec6ca4..d5771552ca 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -30,6 +30,7 @@ if (GALOIS_ENABLE_GPU) # create the galois_gnn_gpu library to get linked into galois_gnn set(gpusources + src/GNNMath.cu src/graphs/GNNGraph.cu src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index d479efe64d..7af2f6c1e8 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -1,8 +1,9 @@ -#ifdef GALOIS_ENABLE_GPU +#ifndef GALOIS_CUDA_UTIL +#define GALOIS_CUDA_UTIL //! @file CUDAUtil.h //! Contains various utility functions for CUDA. -#pragma once #include +#include #include "galois/Logging.h" // TODO check these @@ -41,4 +42,13 @@ } \ } while (0) +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + GALOIS_LOG_ERROR("CuBLAS error code : {}", status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #endif diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh new file mode 100644 index 0000000000..6a7cbbac43 --- /dev/null +++ b/libgnn/include/galois/GNNMath.cuh @@ -0,0 +1,22 @@ +#ifndef GALOIS_GNN_MATH_CUDA +#define GALOIS_GNN_MATH_CUDA +#include "galois/GNNTypes.h" +#include "galois/CUDAUtil.h" + +namespace galois { + +extern bool cublas_is_init; +extern cublasHandle_t global_cublas_handle; + +//! Initializes the cublas handle to use cublas on GPUs. +void InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } + +//! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using +//! CuBLAS. +void CBlasSGEMMGPU(const cublasOperation_t trans_a, + const cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output); + +} // namespace galois +#endif diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu new file mode 100644 index 0000000000..d4c4108785 --- /dev/null +++ b/libgnn/src/GNNMath.cu @@ -0,0 +1,26 @@ +#include "galois/GNNMath.cuh" + +bool galois::cublas_is_init = false; +cublasHandle_t galois::global_cublas_handle; + +void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, + const cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, + GNNFloat* output) { + if (!cublas_is_init) { + InitCuBLAS(); + } + size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; + size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? 
output_columns : input_columns; + float dummy0 = 0.0; + float dummy1 = 1.0; + // because cusparse assumes column major even though we're passing in row + // major, the order of multiply is reversed so that it does what we + // want anyways + // https://stackoverflow.com/questions/56043539/cublassgemm-row-major-multiplication + CUBLAS_CHECK(cublasSgemm(global_cublas_handle, trans_b, trans_a, + output_columns, input_rows, input_columns, &dummy1, + b, lead_dim_b, a, lead_dim_a, &dummy0, output, + output_columns)); +} From 771460ec89f2e124b25ba77385db4cf7fd4312c6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 13:01:52 -0600 Subject: [PATCH 408/660] Renames base gpu alloc object in GNNLayer Before this commit the GPU alloc object in GNNLayer and its children were named the same. This commit changes the name of the base object to something else so that children classes can use it without name conflict. --- libgnn/include/galois/layers/GNNLayer.h | 2 +- libgnn/src/layers/GNNLayer.cpp | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index b1c8ae55a5..5e2d4708ba 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -224,7 +224,7 @@ class GNNLayer { #ifdef GALOIS_ENABLE_GPU //! Object that holds all GPU allocated pointers to memory related to layers - GNNLayerGPUAllocations gpu_object_; + GNNLayerGPUAllocations base_gpu_object_; //! Copies over layer weights to GPU void CopyLayerWeightsToGPU(); #endif diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 8bf9c42dfd..198c40985c 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -29,7 +29,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); #ifdef GALOIS_ENABLE_GPU - gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitWeightMemory(num_weight_elements); #endif } @@ -39,9 +39,9 @@ galois::GNNLayer::GNNLayer(size_t layer_num, backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); #ifdef GALOIS_ENABLE_GPU - gpu_object_.InitInOutMemory(num_output_elements, - layer_dimensions_.input_rows * - layer_dimensions_.input_columns); + base_gpu_object_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); #endif // initialize the PointerWithSize wrappers @@ -53,14 +53,15 @@ galois::GNNLayer::GNNLayer(size_t layer_num, p_backward_output_matrix_ = PointerWithSize(backward_output_matrix_); #else - p_layer_weights_ = PointerWithSize(gpu_object_.layer_weights(), + p_layer_weights_ = PointerWithSize(base_gpu_object_.layer_weights(), layer_weights_.size()); - p_layer_weight_gradients_ = PointerWithSize( - gpu_object_.layer_weight_gradients(), layer_weight_gradients_.size()); + p_layer_weight_gradients_ = + PointerWithSize(base_gpu_object_.layer_weight_gradients(), + layer_weight_gradients_.size()); p_forward_output_matrix_ = PointerWithSize( - gpu_object_.forward_output(), forward_output_matrix_.size()); + base_gpu_object_.forward_output(), forward_output_matrix_.size()); p_backward_output_matrix_ = PointerWithSize( - gpu_object_.backward_output(), backward_output_matrix_.size()); + base_gpu_object_.backward_output(), backward_output_matrix_.size()); // TODO can clear the cpu side vectors/don't use .size() 
since optimally they // aren't initialized #endif @@ -200,6 +201,6 @@ void galois::GNNLayer::WeightGradientSyncAverage() { #ifdef GALOIS_ENABLE_GPU void galois::GNNLayer::CopyLayerWeightsToGPU() { - gpu_object_.CopyToWeights(layer_weights_); + base_gpu_object_.CopyToWeights(layer_weights_); } #endif From c508c09bc5a9fc3cf234a08e1b8729f729f9513a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 13:17:14 -0600 Subject: [PATCH 409/660] Link CuBLAS to GPU build, multiple def. fix Links cuBLAS to the GPU GNN library. Moves the definition of the cuBLAS init function to the .cu file because nvcc doesn't seem to work the same way as gcc/g++ in terms of multiple includes (header guards/pragma once don't seem to work). --- libgnn/CMakeLists.txt | 2 +- libgnn/include/galois/GNNMath.cuh | 2 +- libgnn/src/GNNMath.cu | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index d5771552ca..ff7d47a07d 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -45,7 +45,7 @@ if (GALOIS_ENABLE_GPU) ) # link to gpu lib (which takes care of moderngpu and cub) - target_link_libraries(galois_gnn_gpu Galois::gpu galois_support) + target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas) # gpu -> cpu lib target_link_libraries(galois_gnn galois_gnn_gpu) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 6a7cbbac43..763799f838 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -9,7 +9,7 @@ extern bool cublas_is_init; extern cublasHandle_t global_cublas_handle; //! Initializes the cublas handle to use cublas on GPUs. -void InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } +void InitCuBLAS(); //! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using //! CuBLAS. diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index d4c4108785..be396a153e 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -3,6 +3,8 @@ bool galois::cublas_is_init = false; cublasHandle_t galois::global_cublas_handle; +void galois::InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } + void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, const cublasOperation_t trans_b, size_t input_rows, size_t input_columns, size_t output_columns, From 667e94915916f08d314bd39351c2007025e1a4a8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 13:19:46 -0600 Subject: [PATCH 410/660] UpdateEmbeddingsGPU function added This commit splits the update embeddings call in the GCN layer into a CPU and a GPU version. The GPU version calls a function which at the moment will call into cuBLAS to do the multiplication. Next step is to unit test the forward pass to make sure the results are sane. This commit also includes a clang-format run on SoftmaxLayer (which is currently not used anywhere, as it was an example Xuhao added).
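For quick reference while reading the diff below, the multiply being moved to the GPU is the dense forward update output = node_embeddings x layer_weights. A minimal caller-side sketch follows, using the 7-node, 3-input-column, 2-output-column shapes that the unit tests in this series use; the variable names are assumptions that mirror the layer code, not part of this patch:

    // Sketch only: the forward embedding update is a single SGEMM.
    //   forward_output (7 x 2) = node_embeddings (7 x 3) * layer_weights (3 x 2)
    // The caller keeps thinking in row-major with non-transposed operands;
    // the row-major/column-major swap happens inside CBlasSGEMMGPU.
    CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, /*input_rows=*/7,
                  /*input_columns=*/3, /*output_columns=*/2, node_embeddings,
                  layer_weights, forward_output);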
--- .../galois/layers/GraphConvolutionalLayer.cuh | 5 +++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 9 +++++++++ libgnn/src/layers/GraphConvolutionalLayer.cu | 10 +++++++++- libgnn/src/layers/SoftmaxLayer.cu | 14 +++++--------- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index 993b6f39cb..4b28916db5 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -20,6 +20,11 @@ public: const GNNFloat* node_embeddings, GNNFloat* aggregate_output); + void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, GNNFloat* output); + private: GNNFloat* in_temp_1_{nullptr}; GNNFloat* in_temp_2_{nullptr}; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index af2925facc..1cd7a34f40 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -215,10 +215,19 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { + +#ifndef GALOIS_ENABLE_GPU + // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, node_embeddings, layer_weights_.data(), output); +#else + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); +#endif } void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index e5a34a3c15..0af2201829 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -1,4 +1,4 @@ -#include "galois/CUDAUtil.h" +#include "galois/GNNMath.cuh" #include "galois/layers/GraphConvolutionalLayer.cuh" galois::GCNGPUAllocations::~GCNGPUAllocations() { @@ -79,3 +79,11 @@ void galois::GCNGPUAllocations::AggregateAllGPU( gpu_graph.edge_destinations(), node_embeddings, aggregate_output); CUDA_TEST("GPU aggregate all failure"); } + +void galois::GCNGPUAllocations::UpdateEmbeddingsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* node_embeddings, const GNNFloat* layer_weights, + GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, num_nodes, input_columns, + output_columns, node_embeddings, layer_weights, output); +} diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index d9ed5fc0ff..c3f61dcf6f 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -3,26 +3,22 @@ #include "galois/layers/SoftmaxLayer.h" // Allocate memory and initialize -void galois::SoftmaxLayer::Init() { -} +void galois::SoftmaxLayer::Init() {} // Input: in_tensor // Output: out_tensor void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, - galois::GNNFloat* out_tensor) { -} + galois::GNNFloat* out_tensor) {} // Input: in_tensor // Input: out_tensor // Input: out_gradients // Output: in_gradients -// Note: although out_gradients is an input data, +// Note: although 
out_gradients is an input data, // it is not const because it can be reused -// to hold intermediate data inside this function, +// to hold intermediate data inside this function, // to avoid allocating more memory void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, const galois::GNNFloat* out_tensor, galois::GNNFloat* in_gradients, - galois::GNNFloat* out_gradients) { -} - + galois::GNNFloat* out_gradients) {} From 32829a25759a04b9821b5623bc4ea849f586b387 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 15:52:21 -0600 Subject: [PATCH 411/660] Changing weights on CPU copies it to GPU Initialization of weights on CPU did not initialize weights on GPU: this commit adds a call to copy over newly initialized CPU weights over to the GPU. Moves init of GPU memory before the CPU weights are initialized as well to prevent a nullptr copy. Also adds a debug function to print vectors on the GPU. --- libgnn/include/galois/CUDAUtil.h | 3 +-- libgnn/include/galois/layers/GNNLayer.cuh | 5 +++++ libgnn/include/galois/layers/GNNLayer.h | 11 ++++++++++ libgnn/src/layers/GNNLayer.cpp | 20 ++++++++++++++++--- libgnn/src/layers/GNNLayer.cu | 18 +++++++++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 5 ++--- libgnn/src/layers/GraphConvolutionalLayer.cu | 4 ++++ 7 files changed, 58 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index 7af2f6c1e8..6a7e7e9915 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -18,8 +18,7 @@ do { \ cudaError_t error = condition; \ if (error != cudaSuccess) { \ - GALOIS_LOG_ERROR("CUDA error: {}", cudaGetErrorString(error)); \ - exit(EXIT_FAILURE); \ + GALOIS_LOG_FATAL("CUDA error: {}", cudaGetErrorString(error)); \ } \ } while (0) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 81fa9e2026..b1e5290761 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -14,6 +14,11 @@ public: void InitWeightMemory(size_t num_weights); //! Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); + //! Copy GPU forward output to the provided vector (assumes vector is already + //! correct size) + void CopyForwardOutputToCPU(std::vector* cpu_forward_output); + //! Prints forward output matrix on gpu + void PrintForwardOutput(size_t num); GNNFloat* forward_output() { return forward_output_matrix_; } GNNFloat* backward_output() { return backward_output_matrix_; } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 5e2d4708ba..e6ac1b1497 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -82,6 +82,9 @@ class GNNLayer { if (layer_weights_.size()) { layer_weights_.assign(layer_weights_.size(), 1); } +#ifdef GALOIS_ENABLE_GPU + CopyLayerWeightsToGPU(); +#endif } const PointerWithSize GetForwardOutput() { @@ -131,6 +134,14 @@ class GNNLayer { //! stored in the layer void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); +#ifdef GALOIS_ENABLE_GPU + //! Copies over forward output results to CPU + const std::vector& CopyForwardOutputFromGPU(); + void PrintForwardOutputGPU() { + base_gpu_object_.PrintForwardOutput(forward_output_matrix_.size()); + } +#endif + protected: //! Layer order (starts from 0); used in backward to shortcut output as layer //! 
0 does not need to do some things that other layers need to do diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 198c40985c..7bd591e90a 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,6 +9,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { + // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input dropout_mask_.resize( @@ -18,6 +19,10 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_dimensions_.input_columns * layer_dimensions_.output_columns; layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + base_gpu_object_.InitWeightMemory(num_weight_elements); +#endif + GlorotBengioInit(&layer_weights_); // initialize sync substrate @@ -28,9 +33,6 @@ galois::GNNLayer::GNNLayer(size_t layer_num, *gradient_sync_interface_, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); -#ifdef GALOIS_ENABLE_GPU - base_gpu_object_.InitWeightMemory(num_weight_elements); -#endif } size_t num_output_elements = @@ -77,6 +79,9 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { for (size_t i = 0; i < vector_to_init->size(); i++) { (*vector_to_init)[i] = dist(rng); } +#ifdef GALOIS_ENABLE_GPU + CopyLayerWeightsToGPU(); +#endif } void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { @@ -87,6 +92,9 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); }, galois::loopname("RandomInitVector")); +#ifdef GALOIS_ENABLE_GPU + CopyLayerWeightsToGPU(); +#endif } void galois::GNNLayer::DoDropoutCPU( @@ -203,4 +211,10 @@ void galois::GNNLayer::WeightGradientSyncAverage() { void galois::GNNLayer::CopyLayerWeightsToGPU() { base_gpu_object_.CopyToWeights(layer_weights_); } + +const std::vector& +galois::GNNLayer::CopyForwardOutputFromGPU() { + base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); + return forward_output_matrix_; +} #endif diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 424df92e26..64be961e4b 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -37,4 +37,22 @@ void galois::GNNLayerGPUAllocations::CopyToWeights( cudaMemcpyHostToDevice)); } +void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( + std::vector* cpu_forward_output) { + CUDA_CHECK(cudaMemcpy(cpu_forward_output->data(), forward_output_matrix_, + cpu_forward_output->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + +namespace { +__global__ void PrintVector(galois::GNNFloat* v, unsigned size) { + for (unsigned i = 0; i < size; i++) { + printf("%u %f\n", i, v[i]); + } +} +} // namespace + // TODO copy from gpu function as well just in case I need to check +void galois::GNNLayerGPUAllocations::PrintForwardOutput(size_t size) { + PrintVector<<<1, 1>>>(forward_output_matrix_, size); +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 1cd7a34f40..61a0abaf4c 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -80,9 +80,9 @@ galois::GraphConvolutionalLayer::ForwardPhase( Activation(); } - assert(forward_output_matrix_.size() == + assert(p_forward_output_matrix_.size() == 
(layer_dimensions_.input_rows * layer_dimensions_.output_columns)); - return forward_output_matrix_; + return p_forward_output_matrix_; } galois::PointerWithSize @@ -215,7 +215,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { - #ifndef GALOIS_ENABLE_GPU // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 0af2201829..7161580ee3 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -65,6 +65,10 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, } } } + //__syncthreads(); + // if (thread_lane == 0) { + // printf("Agg %d %f\n", src, aggregate_output[base_src_index]); + //} } } From 9527c9013a77010971cb26c5b4223e2a639f9f80 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 15:54:00 -0600 Subject: [PATCH 412/660] GPU conv layer forward pass test Reenable assertions for the GPU GCN layer forward pass which now works. The next step is to get the backward pass test working which involves adding a function for user code to copy things over to CUDA without needing to include the CUDA header + adding the appropriate GPU functions in the backend. --- libgnn/test/gpu-convlayer-test.cpp | 343 +++++++++++++++-------------- 1 file changed, 175 insertions(+), 168 deletions(-) diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 0123a35b17..7326c1a911 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -39,208 +39,215 @@ int main() { dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const galois::PointerWithSize layer_0_forward_output = - layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + // pointer is to GPU memory: copy it over to a CPU source for verification + layer_0->PrintForwardOutputGPU(); + const std::vector& layer_0_forward_output = + layer_0->CopyForwardOutputFromGPU(); //////////////////////////////////////////////////////////////////////////////// //// sanity check layer 0 output //////////////////////////////////////////////////////////////////////////////// - //// since norm factors aren't invovled it is possible to do full assertions - //// 7 x 2 + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); - //GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); - //GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); - //GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); - //GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); - //GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); - //GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); - //GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); - //GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); - //GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); - //GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); - //GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); - //GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); - //GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); - //GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + 
GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); //////////////////////////////////////////////////////////////////////////////// - //// dummy 1 matrix - //std::vector dummy_ones_v(14, 1); - //galois::PointerWithSize dummy_ones(dummy_ones_v); + // dummy 1 matrix + // std::vector dummy_ones_v(14, 1); + // galois::PointerWithSize dummy_ones(dummy_ones_v); - //// backward pass checking - //// layer 0 means that an empty weight matrix is returned since there is no - //// point passing back anything - //galois::PointerWithSize layer_0_backward_output = + // XXX TODO copy this over to the GPU + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + // galois::PointerWithSize layer_0_backward_output = // layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - //////////////////////////////////////////////////////////////////////////////// - //// sanity check layer 0 backward output; all 0 because layer 0 - //////////////////////////////////////////////////////////////////////////////// - //// since norm factors aren't invovled it is possible to do full assertions - //// 7 x 3 - //GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); - - //galois::PointerWithSize layer_0_weight_gradients = + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + // GALOIS_LOG_ASSERT(layer_0_backward_output.size() 
== 21); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + // galois::PointerWithSize layer_0_weight_gradients = // layer_0->GetLayerWeightGradients(); //// make sure they are sane - //GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); - //layer_0.reset(); + // layer_0.reset(); //////////////////////////////////////////////////////////////////////////////// - //// create layer 1 for testing backward prop actually giving weights back + // create layer 1 for testing backward prop actually giving weights back - //std::unique_ptr layer_1 = - // std::make_unique(1, test_graph, - // dimension_0, dcon); - //layer_1->InitAllWeightsTo1(); - //galois::PointerWithSize layer_1_forward_output = - // layer_1->ForwardPhase(test_graph.GetLocalFeatures()); - //// same check as before for sanity purposes - //GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); - //GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); - //GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); - //GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); - //GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); - //GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); - //GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); - //GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); - //GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); - //GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); - //GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); - //GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); - //GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); - //GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); - //GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); - - //// since layer isn't 0 anymore, backward phase 
will actually return something - //dummy_ones_v.assign(14, 1); - //galois::PointerWithSize layer_1_backward_output = + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + // since layer isn't 0 anymore, backward phase will actually return something + // dummy_ones_v.assign(14, 1); + // galois::PointerWithSize layer_1_backward_output = // layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - //////////////////////////////////////////////////////////////////////////////// - //// check that multiplies go as expected - //////////////////////////////////////////////////////////////////////////////// - //GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); - - //galois::PointerWithSize layer_1_weight_gradients = + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + // GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + // 
GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); + + // galois::PointerWithSize layer_1_weight_gradients = // layer_1->GetLayerWeightGradients(); //// make sure they are sane - //GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); - //layer_1.reset(); + // layer_1.reset(); //////////////////////////////////////////////////////////////////////////////// - //galois::GNNLayerConfig config; - //config.do_dropout = true; - //config.do_activation = true; - //config.do_normalization = true; - //config.allow_aggregate_after_update = false; + // galois::GNNLayerConfig config; + // config.do_dropout = true; + // config.do_activation = true; + // config.do_normalization = true; + // config.allow_aggregate_after_update = false; //// finally, just make sure dropout and activation run without crashes //// (verification requires floating point accuracy or setting a seed which I //// don't have time for at the moment //// TODO in future maybe add better unit test for this - //std::unique_ptr layer_2 = + // std::unique_ptr layer_2 = // std::make_unique(1, test_graph, // dimension_0, config); - //galois::PointerWithSize l2_fo = + // galois::PointerWithSize l2_fo = // layer_2->ForwardPhase(test_graph.GetLocalFeatures()); - //GALOIS_LOG_ASSERT(l2_fo.size() == 14); - //GALOIS_LOG_VERBOSE("{}", l2_fo[0]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[1]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[2]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[3]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[4]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[5]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[6]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[7]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[8]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[9]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[10]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[11]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[12]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - - //galois::PointerWithSize l2_bo = + // GALOIS_LOG_ASSERT(l2_fo.size() == 14); + // GALOIS_LOG_VERBOSE("{}", 
l2_fo[0]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + // galois::PointerWithSize l2_bo = // layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - //GALOIS_LOG_ASSERT(l2_bo.size() == 21); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + // GALOIS_LOG_ASSERT(l2_bo.size() == 21); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } From 818b68564850a5202adae75653c9f63fd2a85ad5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 16:29:02 -0600 Subject: [PATCH 413/660] When init CuBLAS, set var to true --- libgnn/src/GNNMath.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index be396a153e..06a3dc5983 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -12,6 +12,7 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, GNNFloat* output) { if (!cublas_is_init) { InitCuBLAS(); + cublas_is_init = true; } size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? 
output_columns : input_columns; From 25a8d1976a0102f42b3ebe63c25260c5085b2dbe Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 16:52:56 -0600 Subject: [PATCH 414/660] GPU allocation helper function on GNNLayer Adds a function that allocates GPU memory and copies over a particular passed in vector to the GPU. At the moment there is no way to free this memory and it will leak. This function is added mostly for unit test purposes and should not be used otherwise. --- libgnn/include/galois/layers/GNNLayer.cuh | 4 ++++ libgnn/include/galois/layers/GNNLayer.h | 4 ++++ libgnn/src/layers/GNNLayer.cu | 13 +++++++++++++ 3 files changed, 21 insertions(+) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index b1e5290761..951b1c2775 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -20,6 +20,10 @@ public: //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); + //! Helper function: give a vector which is copied over to the GPU (new + //! memory is allocated as necessary) + GNNFloat* Allocate(const std::vector& v); + GNNFloat* forward_output() { return forward_output_matrix_; } GNNFloat* backward_output() { return backward_output_matrix_; } GNNFloat* layer_weights() { return layer_weights_; } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e6ac1b1497..143e3a2cb2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -135,6 +135,10 @@ class GNNLayer { void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); #ifdef GALOIS_ENABLE_GPU + //! Utility function for allocating + PointerWithSize AllocateGPU(const std::vector& v) { + return PointerWithSize(base_gpu_object_.Allocate(v), v.size()); + } //! Copies over forward output results to CPU const std::vector& CopyForwardOutputFromGPU(); void PrintForwardOutputGPU() { diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 64be961e4b..941926f1f6 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -44,6 +44,19 @@ void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( cudaMemcpyDeviceToHost)); } +galois::GNNFloat* +galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { + // TODO keep track of these so that on destruction they can be freed + // accordingly; for now I'll let them leak + galois::GNNFloat* to_return = nullptr; + CUDA_CHECK( + cudaMalloc((void**)(&to_return), v.size() * sizeof(galois::GNNFloat))); + CUDA_CHECK(cudaMemcpy(to_return, v.data(), + v.size() * sizeof(galois::GNNFloat), + cudaMemcpyHostToDevice)); + return to_return; +} + namespace { __global__ void PrintVector(galois::GNNFloat* v, unsigned size) { for (unsigned i = 0; i < size; i++) { From 75719d0eec0e08ca535ee267b9a337ab8f35fc8c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 17:16:03 -0600 Subject: [PATCH 415/660] Prepping GCN Layer backward phase for GPU code Add ifdefs to separate CPU/GPU code in the backward step and also change the structures being used to PointerWithSize (gpus don't like CPU vectors). Added a few TODOs too for better code organization later. 
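As one possible shape for the "put this in a function" TODOs added here, a small private helper could keep the backend ifdef in a single place. This is only a sketch with a hypothetical name (ComputeWeightGradient), not code this patch introduces:

    // Hypothetical helper (not part of this patch): one home for the CPU/GPU
    // ifdef so BackwardPhase does not repeat it at both weight-gradient sites.
    void galois::GraphConvolutionalLayer::ComputeWeightGradient(
        const GNNFloat* prev_layer_input, const GNNFloat* gradients) {
    #ifndef GALOIS_ENABLE_GPU
      galois::CBlasSGEMM(CblasTrans, CblasNoTrans,
                         layer_dimensions_.input_columns,
                         layer_dimensions_.input_rows,
                         layer_dimensions_.output_columns, prev_layer_input,
                         gradients, p_layer_weight_gradients_.data());
    #else
      // XXX GPU path to be filled in once the GPU gradient functions exist
    #endif
    }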
--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 61a0abaf4c..1dbfdacb2b 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -103,47 +103,58 @@ galois::GraphConvolutionalLayer::BackwardPhase( // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() == layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(in_temp_1_.size() == + assert(p_in_temp_1_.size() == layer_dimensions_.input_columns * layer_dimensions_.input_rows); - UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), - backward_output_matrix_.data(), + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), &input_column_intermediates_); } // weight gradient calculation + // TODO put this in a function to put the ifdef in there +#ifndef GALOIS_ENABLE_GPU galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, prev_layer_input.data(), input_gradient->data(), - layer_weight_gradients_.data()); + p_layer_weight_gradients_.data()); +#else + // XXX +#endif } else { // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - out_temp_.data(), &output_column_intermediates_); + p_out_temp_.data(), &output_column_intermediates_); if (layer_number_ != 0) { // derivative for update - UpdateEmbeddingsDerivative(out_temp_.data(), - backward_output_matrix_.data()); + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); } + // TODO put this in a function +#ifndef GALOIS_ENABLE_GPU // weight gradient; note the use of the aggregated gradient in out_temp galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, prev_layer_input.data(), out_temp_.data(), - layer_weight_gradients_.data()); + p_layer_weight_gradients_.data()); +#else + // XXX +#endif } // sync weight gradients; note aggregation sync occurs in the function call // already + // TODO figure out how to do this with GPUs WeightGradientSyncAverage(); if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); } - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } void galois::GraphConvolutionalLayer::AggregateAll( @@ -231,12 +242,16 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddings( void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { - assert(layer_weights_.size() == + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifndef GALOIS_ENABLE_GPU // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, layer_weights_.data(), 
output); +#else + // XXX +#endif } From 6af03430cf2a8e56b32d748c60c35e3d26c17734 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 18:25:57 -0600 Subject: [PATCH 416/660] GPU GCN weight gradient/layer gradient calc Adds functions for calculating the weight and layer gradients of the GCN layer. Untested: the tests will be added in a commit down the line. --- .../galois/layers/GraphConvolutionalLayer.cuh | 9 +++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 18 ++++++++++++++---- libgnn/src/layers/GraphConvolutionalLayer.cu | 17 +++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index 4b28916db5..fd4d9d76f0 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -24,6 +24,15 @@ public: size_t output_columns, const GNNFloat* node_embeddings, const GNNFloat* layer_weights, GNNFloat* output); + void UpdateEmbeddingsDerivativeGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, + GNNFloat* output); + + void GetWeightGradientsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, const GNNFloat* prev_input, + const GNNFloat* gradients, GNNFloat* output); private: GNNFloat* in_temp_1_{nullptr}; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 1dbfdacb2b..ef9d3cbb03 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -120,7 +120,10 @@ galois::GraphConvolutionalLayer::BackwardPhase( prev_layer_input.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #else - // XXX + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); #endif } else { // aggregate occurs regardless of layer being equal to 0 because it is @@ -138,10 +141,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), out_temp_.data(), + prev_layer_input.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); #else - // XXX + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); #endif } @@ -252,6 +258,10 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( layer_dimensions_.input_columns, gradients, layer_weights_.data(), output); #else - // XXX + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + #endif } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 7161580ee3..5b6124211d 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -91,3 +91,20 @@ void galois::GCNGPUAllocations::UpdateEmbeddingsGPU( CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, num_nodes, input_columns, output_columns, 
node_embeddings, layer_weights, output); } + +void galois::GCNGPUAllocations::UpdateEmbeddingsDerivativeGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* gradients, const GNNFloat* layer_weights, + GNNFloat* output) { + // note output clumns/input columns are flipped due to transpose of the + // layer weights + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_T, num_nodes, output_columns, + input_columns, gradients, layer_weights, output); +} + +void galois::GCNGPUAllocations::GetWeightGradientsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* prev_input, const GNNFloat* gradients, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_T, CUBLAS_OP_N, input_columns, num_nodes, + output_columns, prev_input, gradients, output); +} From d1a7eff9376ce0ab72dd619330375c18355920e4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 18:51:50 -0600 Subject: [PATCH 417/660] Functions for backward/weight gradient from GPU Add functions to copy the backward output and weight gradients of a layer from GPU to CPU. Also moved some function definitions to the header since the definitions were quite small. --- libgnn/include/galois/layers/GNNLayer.cuh | 7 +++++++ libgnn/include/galois/layers/GNNLayer.h | 22 +++++++++++++++++++--- libgnn/src/layers/GNNLayer.cpp | 12 ------------ libgnn/src/layers/GNNLayer.cu | 14 ++++++++++++++ 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 951b1c2775..387b1673c4 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -17,6 +17,13 @@ public: //! Copy GPU forward output to the provided vector (assumes vector is already //! correct size) void CopyForwardOutputToCPU(std::vector* cpu_forward_output); + //! Copy GPU backward output to the provided vector (assumes vector is already + //! correct size) + void CopyBackwardOutputToCPU(std::vector* cpu_backward_output); + //! Copy GPU weight gradients to the provided vector (assumes vector is + //! already correct size) + void CopyWeightGradientsToCPU(std::vector* cpu_gradients); + //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 143e3a2cb2..f4acec8f25 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -139,8 +139,22 @@ class GNNLayer { PointerWithSize AllocateGPU(const std::vector& v) { return PointerWithSize(base_gpu_object_.Allocate(v), v.size()); } - //! Copies over forward output results to CPU - const std::vector& CopyForwardOutputFromGPU(); + //! Copies over forward output results to CPU from GPU + const std::vector& CopyForwardOutputFromGPU() { + base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); + return forward_output_matrix_; + } + //! Copies over backward output results to CPU from GPU + const std::vector& CopyBackwardOutputFromGPU() { + base_gpu_object_.CopyBackwardOutputToCPU(&backward_output_matrix_); + return backward_output_matrix_; + } + //! Copies over weight gradients to CPU from GPU + const std::vector& CopyWeightGradientsFromGPU() { + base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); + return layer_weight_gradients_; + } + void PrintForwardOutputGPU() { base_gpu_object_.PrintForwardOutput(forward_output_matrix_.size()); } @@ -241,7 +255,9 @@ class GNNLayer { //! 
Object that holds all GPU allocated pointers to memory related to layers GNNLayerGPUAllocations base_gpu_object_; //! Copies over layer weights to GPU - void CopyLayerWeightsToGPU(); + void CopyLayerWeightsToGPU() { + base_gpu_object_.CopyToWeights(layer_weights_); + } #endif }; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 7bd591e90a..3cfbd990a0 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -206,15 +206,3 @@ void galois::GNNLayer::WeightGradientSyncAverage() { galois::loopname("WeightGradientSyncAverageDivide")); } } - -#ifdef GALOIS_ENABLE_GPU -void galois::GNNLayer::CopyLayerWeightsToGPU() { - base_gpu_object_.CopyToWeights(layer_weights_); -} - -const std::vector& -galois::GNNLayer::CopyForwardOutputFromGPU() { - base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); - return forward_output_matrix_; -} -#endif diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 941926f1f6..597fba96bd 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -44,6 +44,20 @@ void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( cudaMemcpyDeviceToHost)); } +void galois::GNNLayerGPUAllocations::CopyBackwardOutputToCPU( + std::vector* cpu_backward_output) { + CUDA_CHECK(cudaMemcpy(cpu_backward_output->data(), backward_output_matrix_, + cpu_backward_output->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + +void galois::GNNLayerGPUAllocations::CopyWeightGradientsToCPU( + std::vector* cpu_gradients) { + CUDA_CHECK(cudaMemcpy(cpu_gradients->data(), layer_weight_gradients_, + cpu_gradients->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + galois::GNNFloat* galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { // TODO keep track of these so that on destruction they can be freed From 614651fad02088f967f06424a6b79d5b5e86e748 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 19:09:16 -0600 Subject: [PATCH 418/660] Readded zero'ing of output matrix for aggregation Since the aggregation in the GPU doesn't actually overwrite but adds to, the entire output matrix needs to be zero'd out before anything is done on it else you will have garbage values on it. --- libgnn/src/layers/GraphConvolutionalLayer.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 5b6124211d..7828336b28 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -78,6 +78,8 @@ void galois::GCNGPUAllocations::AggregateAllGPU( const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output) { + CUDA_CHECK(cudaMemset(aggregate_output, 0, + num_nodes * column_length * sizeof(GNNFloat))); AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( num_nodes, column_length, gpu_graph.edge_index(), gpu_graph.edge_destinations(), node_embeddings, aggregate_output); From 068387cbc5e766e853e3ecadeb5ff5a06f19d22f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 19:10:12 -0600 Subject: [PATCH 419/660] GPU GCN layer unit test: simple forward/backward The forward and backward pass of a GCN layer without dropout/activation works fine now. All that is left for fully functioning GPU code is the output layer (softmax). 
Dropout and activation are nice to have but are not critical to "function" (though obviously they will be added). --- libgnn/test/gpu-convlayer-test.cpp | 159 +++++++++++++++-------------- 1 file changed, 81 insertions(+), 78 deletions(-) diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 7326c1a911..f4bb4cf4d3 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -41,7 +41,6 @@ int main() { // make sure it runs in a sane manner layer_0->ForwardPhase(test_graph.GetLocalFeatures()); // pointer is to GPU memory: copy it over to a CPU source for verification - layer_0->PrintForwardOutputGPU(); const std::vector& layer_0_forward_output = layer_0->CopyForwardOutputFromGPU(); @@ -68,56 +67,58 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // dummy 1 matrix - // std::vector dummy_ones_v(14, 1); - // galois::PointerWithSize dummy_ones(dummy_ones_v); - - // XXX TODO copy this over to the GPU + std::vector dummy_ones_v(14, 1); + // TODO need to free the gpu pointer + galois::PointerWithSize dummy_ones = + layer_0->AllocateGPU(dummy_ones_v); // backward pass checking // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything // galois::PointerWithSize layer_0_backward_output = - // layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const std::vector& layer_0_backward_output = + layer_0->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// // sanity check layer 0 backward output; all 0 because layer 0 ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - // GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); - - // galois::PointerWithSize layer_0_weight_gradients = - // layer_0->GetLayerWeightGradients(); - //// make sure they are sane - // GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - // 
GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); - - // layer_0.reset(); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + const std::vector& layer_0_weight_gradients = + layer_0->CopyWeightGradientsFromGPU(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + layer_0.reset(); //////////////////////////////////////////////////////////////////////////////// @@ -131,7 +132,6 @@ int main() { const std::vector& layer_1_forward_output = layer_1->CopyForwardOutputFromGPU(); - // same check as before for sanity purposes GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); @@ -149,49 +149,52 @@ int main() { GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); // since layer isn't 0 anymore, backward phase will actually return something - // dummy_ones_v.assign(14, 1); - // galois::PointerWithSize layer_1_backward_output = - // layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + dummy_ones = layer_1->AllocateGPU(dummy_ones_v); + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const std::vector& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected ////////////////////////////////////////////////////////////////////////////// - // GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); - // 
GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); - - // galois::PointerWithSize layer_1_weight_gradients = - // layer_1->GetLayerWeightGradients(); - //// make sure they are sane - // GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); - - // layer_1.reset(); + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); + + const std::vector& layer_1_weight_gradients = + layer_1->CopyWeightGradientsFromGPU(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + layer_1.reset(); //////////////////////////////////////////////////////////////////////////////// + // TODO get dropout and activation working + // galois::GNNLayerConfig config; // config.do_dropout = true; // config.do_activation = true; From bbe5fe7de6b2f8e75f2738d5ef82c4b056fa6224 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 19:17:49 -0600 Subject: [PATCH 420/660] Softmax layer funcs prep split into CPU/GPU Add ifdefs to calls in softmax layer in preparation for GPU calls. 
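The shape of this split is a single public entry point per phase that compiles down to either the CPU helper or, in a later commit, a GPU call. A minimal self-contained sketch of that compile-time dispatch, using placeholder names rather than the real layer types:

// Illustrative sketch only: GALOIS_ENABLE_GPU selects the backend at compile
// time while callers keep using one function name. RunCPU stands in for the
// ForwardPhaseCPU/BackwardPhaseCPU helpers added in this patch.
#include <cstdio>
#include <vector>

static std::vector<float> RunCPU(const std::vector<float>& input) {
  std::vector<float> out(input);
  for (float& v : out) {
    v += 1.0f; // stand-in for the real CPU compute
  }
  return out;
}

std::vector<float> Run(const std::vector<float>& input) {
#ifndef GALOIS_ENABLE_GPU
  return RunCPU(input);
#else
  // GPU build: would hand device buffers to the CUDA-side object instead;
  // falls back to the CPU helper here since that call is not defined yet
  return RunCPU(input);
#endif
}

int main() {
  std::vector<float> result = Run({1.0f, 2.0f});
  std::printf("%f %f\n", result[0], result[1]);
  return 0;
}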
--- libgnn/include/galois/layers/SoftmaxLayer.h | 6 +++++ libgnn/src/layers/SoftmaxLayer.cpp | 26 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 5c412f6bf3..b9f821787b 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -24,9 +24,15 @@ class SoftmaxLayer : public GNNLayer { Init(); } + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings); //! Creates probability distribution of each row of input const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhaseCPU(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. PointerWithSize diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 07e78d3c1f..c3bfdb00e7 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -6,7 +6,7 @@ void galois::SoftmaxLayer::Init() {} const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase( +galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); @@ -42,9 +42,19 @@ galois::SoftmaxLayer::ForwardPhase( return forward_output_matrix_; } +const galois::PointerWithSize +galois::SoftmaxLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { +#ifndef GALOIS_ENABLE_GPU + return ForwardPhaseCPU(input_embeddings); +#else + // XXX +#endif +} + galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, - PointerWithSize*) { +galois::SoftmaxLayer::BackwardPhaseCPU(const PointerWithSize, + PointerWithSize*) { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( @@ -86,4 +96,14 @@ galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, return PointerWithSize(backward_output_matrix_); } +galois::PointerWithSize +galois::SoftmaxLayer::BackwardPhase(const PointerWithSize a, + PointerWithSize* b) { +#ifndef GALOIS_ENABLE_GPU + return BackwardPhaseCPU(a, b); +#else + // gpu_object_.BackwardPhaseGPU( +#endif +} + // TODO function for getting loss From e735eb65fdee68269fc2e77c7f1dd69bc3360625 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 19:24:31 -0600 Subject: [PATCH 421/660] Dataset masks in GNNs now chars For some reason they were Labels which are not needed since the masks are essentially bitsets: they have been changed to chars to save more space. --- libgnn/include/galois/graphs/GNNGraph.h | 8 ++++---- libgnn/src/graphs/GNNGraph.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 5383b325d3..04debc019f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -160,11 +160,11 @@ class GNNGraph { // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes - std::vector local_training_mask_; + std::vector local_training_mask_; //! Bitset indicating which nodes are validation nodes - std::vector local_validation_mask_; + std::vector local_validation_mask_; //! 
Bitset indicating which nodes are testing nodes - std::vector local_testing_mask_; + std::vector local_testing_mask_; //! Global mask range for training nodes; must convert to LIDs when using //! in this class @@ -194,7 +194,7 @@ class GNNGraph { //! given a name, mask type, and arrays to save into size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, GNNLabel* masks); + GNNRange* mask_range, char* masks); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); //! Reads the entire graph topology in (but nothing else) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 918ce3d735..6e616e851b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -253,7 +253,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( //! given a name, mask type, and arrays to save into size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, GNNLabel* masks) { + GNNRange* mask_range, char* masks) { size_t range_begin; size_t range_end; From 85c544e7355c9270d3726ef91f2b4124886237aa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 19:42:11 -0600 Subject: [PATCH 422/660] Cleanup to Softmax layer to let GPU build work Returning dead objects + removing unused arguments in Softmax layer files to allow GPU build to compile --- libgnn/include/galois/layers/SoftmaxLayer.h | 3 +-- libgnn/src/layers/SoftmaxLayer.cpp | 17 ++++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index b9f821787b..62a4d9ff75 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -31,8 +31,7 @@ class SoftmaxLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhaseCPU(const PointerWithSize prev_layer_input, - PointerWithSize* input_gradient); + BackwardPhaseCPU(); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
PointerWithSize diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index c3bfdb00e7..b0446fd3ab 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -43,18 +43,19 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { +galois::SoftmaxLayer::ForwardPhase([ + [maybe_unused]] const galois::PointerWithSize + input_embeddings) { #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else // XXX + return PointerWithSize(); #endif } galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhaseCPU(const PointerWithSize, - PointerWithSize*) { +galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( @@ -97,12 +98,14 @@ galois::SoftmaxLayer::BackwardPhaseCPU(const PointerWithSize, } galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(const PointerWithSize a, - PointerWithSize* b) { +galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, + PointerWithSize*) { #ifndef GALOIS_ENABLE_GPU - return BackwardPhaseCPU(a, b); + return BackwardPhaseCPU(); #else + // XXX // gpu_object_.BackwardPhaseGPU( + return PointerWithSize(); #endif } From 6139288827858dceef5ef40761eeb77a594cdecf Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 20:00:42 -0600 Subject: [PATCH 423/660] Copy over GNN node masks to GPU Adds code to copy over the masks for the train, val, test sets to the GPU. Removes norm factor variable + adds the free calls for masks to the destructor as well. --- libgnn/include/galois/graphs/GNNGraph.cuh | 13 +++++++------ libgnn/src/graphs/GNNGraph.cpp | 4 ++-- libgnn/src/graphs/GNNGraph.cu | 23 ++++++++++++++++++++++- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 33093d2ebc..3470056663 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -20,6 +20,9 @@ public: unsigned num_features); //! Copy over ground truth for the graph to GPU void SetLabels(const std::vector& ground_truth); + //! Copy over masks for the 3 sets to GPU + void SetMasks(const std::vector& train, const std::vector& val, + const std::vector& test); GNNFeature* feature_vector() { return feature_vector_; }; const GNNFeature* feature_vector() const { return feature_vector_; }; @@ -48,12 +51,10 @@ private: GNNFeature* feature_vector_{nullptr}; //! (Local) ground truth vector GNNFloat* ground_truth_{nullptr}; - // TODO need this? - //! (Local) norm factors - GNNFloat* norm_factors_{nullptr}; - - // TODO masks? other things I haven't considered yet? 
will determine if they - // are needed + // masks for phases + char* local_training_mask_{nullptr}; + char* local_validation_mask_{nullptr}; + char* local_testing_mask_{nullptr}; }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 6e616e851b..cbdf5e13db 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -374,8 +374,6 @@ void galois::graphs::GNNGraph::InitNormFactor() { #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { - // XXX finish up GPU memory allocation; currently just testing the build - // create int casted CSR uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); @@ -419,5 +417,7 @@ void galois::graphs::GNNGraph::InitGPUMemory() { gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); gpu_memory_.SetLabels(local_ground_truth_labels_); + gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, + local_testing_mask_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index f13bbf4089..b0d5c1eb43 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -10,7 +10,9 @@ galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { CUDA_FREE(edge_destinations_); CUDA_FREE(feature_vector_); CUDA_FREE(ground_truth_); - CUDA_FREE(norm_factors_); + CUDA_FREE(local_training_mask_); + CUDA_FREE(local_validation_mask_); + CUDA_FREE(local_testing_mask_); } void galois::graphs::GNNGraphGPUAllocations::SetGraphTopology( @@ -61,3 +63,22 @@ void galois::graphs::GNNGraphGPUAllocations::SetLabels( ground_truth.size() * sizeof(GNNLabel), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::SetMasks( + const std::vector& train, const std::vector& val, + const std::vector& test) { + CUDA_CHECK( + cudaMalloc((void**)(&local_training_mask_), train.size() * sizeof(char))); + CUDA_CHECK(cudaMemcpy(local_training_mask_, train.data(), + train.size() * sizeof(char), cudaMemcpyHostToDevice)); + + CUDA_CHECK( + cudaMalloc((void**)(&local_validation_mask_), val.size() * sizeof(char))); + CUDA_CHECK(cudaMemcpy(local_validation_mask_, val.data(), + val.size() * sizeof(char), cudaMemcpyHostToDevice)); + + CUDA_CHECK( + cudaMalloc((void**)(&local_testing_mask_), test.size() * sizeof(char))); + CUDA_CHECK(cudaMemcpy(local_testing_mask_, test.data(), + test.size() * sizeof(char), cudaMemcpyHostToDevice)); +} From cba333bf432efc5afd49126b4f4182dda12bf91d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 17:28:31 -0600 Subject: [PATCH 424/660] Softmax GPU object + hook to its forward phase Adds the file for the GPU object for the Softmax layer and adds the call to the Forward phase of the GPU code. The call itself is not yet defined. 
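The intended semantics of the forward call being hooked up here, written out as a host-side sketch (a hypothetical helper for checking GPU output on small graphs, not the actual API): only nodes whose mask bit for the current phase is set get a probability distribution, and every other row of the output stays zero, which is what the GPU unit test added later in this series asserts.

// Hypothetical reference implementation of the masked softmax forward pass.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void MaskedSoftmax(size_t num_nodes, size_t num_classes,
                   const std::vector<char>& mask,
                   const std::vector<float>& input,
                   std::vector<float>* output) {
  output->assign(num_nodes * num_classes, 0.0f);
  for (size_t n = 0; n < num_nodes; n++) {
    if (mask[n] != 1) {
      continue; // nodes outside the current phase keep an all-zero row
    }
    const float* row = &input[n * num_classes];
    float* out_row   = &(*output)[n * num_classes];
    float max_val    = *std::max_element(row, row + num_classes);
    float denom      = 0.0f;
    for (size_t c = 0; c < num_classes; c++) {
      out_row[c] = std::exp(row[c] - max_val); // shift by max for stability
      denom += out_row[c];
    }
    for (size_t c = 0; c < num_classes; c++) {
      out_row[c] /= denom;
    }
  }
}

int main() {
  // 2 nodes, 3 classes; only node 0 is in the current phase
  std::vector<char> mask   = {1, 0};
  std::vector<float> input = {1.0f, 2.0f, 3.0f, 5.0f, 5.0f, 5.0f};
  std::vector<float> output;
  MaskedSoftmax(2, 3, mask, input, &output);
  for (float v : output) {
    std::printf("%f ", v); // node 0's row sums to 1, node 1's row is all zeros
  }
  std::printf("\n");
  return 0;
}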
--- libgnn/include/galois/graphs/GNNGraph.cuh | 15 ++++++---- libgnn/include/galois/layers/SoftmaxLayer.cuh | 28 +++++++++++++++++++ libgnn/include/galois/layers/SoftmaxLayer.h | 17 +++++++++-- libgnn/src/layers/SoftmaxLayer.cpp | 11 ++++---- 4 files changed, 57 insertions(+), 14 deletions(-) create mode 100644 libgnn/include/galois/layers/SoftmaxLayer.cuh diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 3470056663..528fe4ceb2 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -24,12 +24,15 @@ public: void SetMasks(const std::vector& train, const std::vector& val, const std::vector& test); - GNNFeature* feature_vector() { return feature_vector_; }; - const GNNFeature* feature_vector() const { return feature_vector_; }; - int* edge_index() { return edge_index_; } - const int* edge_index() const { return edge_index_; } - int* edge_destinations() { return edge_destinations_; } - const int* edge_destinations() const { return edge_destinations_; } + GNNFeature* feature_vector() const { return feature_vector_; }; + int* edge_index() const { return edge_index_; } + int* edge_destinations() const { return edge_destinations_; } + + GNNFloat* ground_truth() const { return ground_truth_; } + + char* local_training_mask() const { return local_training_mask_; } + char* local_validation_mask() const { return local_validation_mask_; } + char* local_testing_mask() const { return local_testing_mask_; } private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh new file mode 100644 index 0000000000..440bb1f488 --- /dev/null +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -0,0 +1,28 @@ +#ifndef GALOIS_SOFTMAX_GPU +#define GALOIS_SOFTMAX_GPU +#include "galois/graphs/GNNGraph.cuh" +namespace galois { + +//! Contains implementation for the forward/backward pass of the softmax layer +//! on GPUs. +class SoftmaxLayerGPU { +public: + //! 
Initialize by saving pointers to already initialized GPU memory + SoftmaxLayerGPU(const galois::graphs::GNNGraphGPUAllocations& gpu_graph) + : train_mask_(gpu_graph.local_training_mask()), + val_mask_(gpu_graph.local_validation_mask()), + test_mask_(gpu_graph.local_testing_mask()), + local_labels_(gpu_graph.ground_truth()) {} + void ForwardPhaseGPU(size_t num_nodes, size_t feature_length, + const GNNFloat* input_embeddings, GNNFloat* output); + void BackwardPhaseGPU(GNNFloat* output); + +private: + char* train_mask_; + char* val_mask_; + char* test_mask_; + GNNFloat* local_labels_; +}; + +} // namespace galois +#endif diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 62a4d9ff75..76a7ec654f 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -1,5 +1,8 @@ #pragma once #include "galois/layers/GNNLayer.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/layers/SoftmaxLayer.cuh" +#endif namespace galois { @@ -12,10 +15,15 @@ class SoftmaxLayer : public GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig{.allocate_weights = false}), +#ifdef GALOIS_ENABLE_GPU + gpu_object_(graph.GetGPUGraph()), +#endif input_loss_(dimensions.input_rows), ground_truth_vectors_(dimensions.input_columns), norm_gradient_vectors_(dimensions.input_columns), - softmax_temp_vectors_(dimensions.input_columns) { + softmax_temp_vectors_(dimensions.input_columns) + + { output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); @@ -30,8 +38,7 @@ class SoftmaxLayer : public GNNLayer { const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; - PointerWithSize - BackwardPhaseCPU(); + PointerWithSize BackwardPhaseCPU(); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. PointerWithSize @@ -39,6 +46,10 @@ class SoftmaxLayer : public GNNLayer { PointerWithSize* input_gradient) final; private: +#ifdef GALOIS_ENABLE_GPU + SoftmaxLayerGPU gpu_object_; +#endif + //! Loss for each row of the input std::vector input_loss_; //! Each thread gets storage to allocate the ground truth vector in during diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index b0446fd3ab..00c0c05edd 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -43,14 +43,15 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase([ - [maybe_unused]] const galois::PointerWithSize - input_embeddings) { +galois::SoftmaxLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else - // XXX - return PointerWithSize(); + gpu_object_.ForwardPhaseGPU(graph_.size(), graph_.node_feature_length(), + input_embeddings.data(), + p_forward_output_matrix_.data()); + return p_forward_output_matrix_; #endif } From ed546d4d5334cdd52235ace01f39b4d4058dec29 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 18:58:04 -0600 Subject: [PATCH 425/660] Softmax function for GPUs Adds a softmax function on GPUs that can be called from GPU kernels. 
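The device function uses the standard max-subtraction trick: softmax(x)_i = expf(x_i - max(x)) / sum_j expf(x_j - max(x)), which gives the same result as the unshifted form but keeps expf from overflowing on large logits. A quick self-contained check of that identity (illustrative only, not part of the patch):

// Shifting by the max changes nothing mathematically (the common factor
// cancels in the normalization) but keeps expf in range for large inputs.
#include <cmath>
#include <cstdio>

int main() {
  const float x[3] = {1000.0f, 1001.0f, 1002.0f}; // naive expf(x[i]) overflows
  float max_val    = x[2];
  float shifted[3];
  float denom = 0.0f;
  for (int i = 0; i < 3; i++) {
    shifted[i] = std::exp(x[i] - max_val);
    denom += shifted[i];
  }
  for (int i = 0; i < 3; i++) {
    // prints the same distribution as a softmax of {0, 1, 2}: ~0.09 0.24 0.67
    std::printf("%f ", shifted[i] / denom);
  }
  std::printf("\n");
  return 0;
}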
--- libgnn/include/galois/GNNMath.cuh | 6 ++++++ libgnn/src/GNNMath.cu | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 763799f838..212226d00b 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -18,5 +18,11 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Given a vector, apply a softmax on some specified # of elements and save +//! the result to the specified output. Since this is a device function, +//! all pointers should be to GPU memory. +__device__ void DoSoftmax(size_t vector_length, const GNNFloat* input, + GNNFloat* output); + } // namespace galois #endif diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 06a3dc5983..a04fac6962 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -27,3 +27,26 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, b, lead_dim_b, a, lead_dim_a, &dummy0, output, output_columns)); } + +__device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, + GNNFloat* output) { + // find max value + GNNFloat current_max = input[0]; + for (size_t i = 1; i < vector_length; i++) { + if (input[i] > current_max) { + current_max = input[i]; + } + } + // set output by scaling with the max + GNNFloat denominator = 0.0; + for (size_t i = 0; i < vector_length; i++) { + // NOTE: expf only works for single precision float; may need to change if + // we ever switch to double + output[i] = expf(input[i] - current_max); + denominator += output[i]; + } + // denominator scale + for (size_t i = 0; i < vector_length; i++) { + output[i] /= denominator; + } +} From 8017d6a473efe0ce5fad02183656406569b016df Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 19:38:52 -0600 Subject: [PATCH 426/660] CUDA_KERNEL_LOOP and some helper calcs Added a few things from old codebase's CUDA utils to new one in preparation for using the newly added things to compute the softmax layer. Also added the original source of the old code: Caffe. --- libgnn/include/galois/CUDAUtil.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index 6a7e7e9915..51be6cd102 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -2,10 +2,21 @@ #define GALOIS_CUDA_UTIL //! @file CUDAUtil.h //! Contains various utility functions for CUDA. +//! Taken and revised+added to from here +//! https://github.com/BVLC/caffe/blob/master/include/caffe/util/device_alternate.hpp #include #include #include "galois/Logging.h" +// TODO check these too and make sure they make sense +// CUDA: use 256 threads per block +const int CUDA_NUM_THREADS = 256; + +// CUDA: number of blocks for threads. +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + // TODO check these #define CHUNK_SIZE 256 #define TB_SIZE 256 @@ -14,6 +25,7 @@ #define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) +//! Wrap a CUDA call with this to auto-check if it returns any error #define CUDA_CHECK(condition) \ do { \ cudaError_t error = condition; \ @@ -22,6 +34,7 @@ } \ } while (0) +//! Frees a pointer allocated by cuda malloc #define CUDA_FREE(ptr) \ do { \ if (ptr) { \ @@ -30,6 +43,7 @@ } \ } while (0) +//! 
Call this after a cuda call to make sure it set any error flags #define CUDA_TEST(msg) \ do { \ cudaError_t e; \ @@ -41,6 +55,13 @@ } \ } while (0) +//! Basic kernel loop for CUDA threads +//! Caffe describes it as "grid stride" +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +//! Wrap a CuBLAS call with this to check if it threw any errors #define CUBLAS_CHECK(condition) \ do { \ cublasStatus_t status = condition; \ From 5eb6b4d085f90bba8adeb8e04f4b0103a7c6388a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 19:41:41 -0600 Subject: [PATCH 427/660] Definition of forward phase gpu softmax This commit adds the softmax/cross entropy function to GNNMath.cu and uses it to define the GPU Softmax forward phase function. An additional argument was added to the forward phase gpu call to deal with the different phases: the phase argument details which mask to use in the softmax. There are a few things left to do that will be done later, namely zero'ing out the output matrix. Note that I have NOT defined cross entropy for the forward phase: it is only used to calculate loss, and I'm not using loss nor referring to it anywhere in my code or analysis at the moment.. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNMath.cuh | 6 +++ libgnn/include/galois/layers/SoftmaxLayer.cuh | 5 ++- libgnn/src/GNNMath.cpp | 4 ++ libgnn/src/GNNMath.cu | 14 +++++++ libgnn/src/layers/SoftmaxLayer.cpp | 2 +- libgnn/src/layers/SoftmaxLayer.cu | 37 +++++++++++++------ 7 files changed, 54 insertions(+), 15 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ff7d47a07d..ca50e171ee 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -34,6 +34,7 @@ if (GALOIS_ENABLE_GPU) src/graphs/GNNGraph.cu src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu + src/layers/SoftmaxLayer.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 212226d00b..01afe64f03 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -18,6 +18,12 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Runs softmax + cross entropy on masked nodes +__global__ void +SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* input_embeddings, + galois::GNNFloat* output); + //! Given a vector, apply a softmax on some specified # of elements and save //! the result to the specified output. Since this is a device function, //! all pointers should be to GPU memory. 
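For completeness, the loss the skipped cross entropy step would produce for a single-class (one-hot) ground truth is just the negative log of the probability assigned to the true class. A hypothetical sketch of that calculation (the function name is made up; the epsilon mirrors the 1e-10 guard the backward kernel uses later in this series):

// Per-node loss for a one-hot ground truth: -log(p[true class]).
// The epsilon guards against log(0) when the predicted probability is zero.
#include <cmath>
#include <cstdio>

float CrossEntropyLoss(const float* prediction, int ground_truth_class) {
  return -std::log(prediction[ground_truth_class] + 1e-10f);
}

int main() {
  const float prediction[3] = {0.1f, 0.7f, 0.2f}; // a node's softmax output
  std::printf("%f\n", CrossEntropyLoss(prediction, 1)); // ~0.3567
  return 0;
}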
diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index 440bb1f488..40e9681bb1 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -13,8 +13,9 @@ public: val_mask_(gpu_graph.local_validation_mask()), test_mask_(gpu_graph.local_testing_mask()), local_labels_(gpu_graph.ground_truth()) {} - void ForwardPhaseGPU(size_t num_nodes, size_t feature_length, - const GNNFloat* input_embeddings, GNNFloat* output); + void ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, + size_t feature_length, const GNNFloat* input_embeddings, + GNNFloat* output); void BackwardPhaseGPU(GNNFloat* output); private: diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 5e9fb8d050..0d065d6bcc 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -88,6 +88,10 @@ galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, const GNNFloat* input) { GNNFloat loss = 0.0; + // Note that this function works if there are multiple non-zeros in the + // ground truth vector + // If there is only 1 then this function is overkill and it should break + // early for (size_t i = 0; i < vector_length; i++) { if (ground_truth[i] == 0.0) { continue; diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index a04fac6962..d33ea88cc9 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -28,6 +28,20 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, output_columns)); } + +__global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* input_embeddings, + galois::GNNFloat* output) { + // XXX zero out output + CUDA_KERNEL_LOOP(i, num_nodes) { + if (mask[i] == 1) { + galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, output + feature_length * i); + // ignoring crossentropy loss calculation for now because I'm not using + // loss for anything + didn't bother allocating an array to store loss anyways + } + } +} + __device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, GNNFloat* output) { // find max value diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 00c0c05edd..57a10af41c 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -48,7 +48,7 @@ galois::SoftmaxLayer::ForwardPhase( #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else - gpu_object_.ForwardPhaseGPU(graph_.size(), graph_.node_feature_length(), + gpu_object_.ForwardPhaseGPU(layer_phase_, graph_.size(), graph_.node_feature_length(), input_embeddings.data(), p_forward_output_matrix_.data()); return p_forward_output_matrix_; diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index c3f61dcf6f..a562923a98 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -1,14 +1,27 @@ +#include "galois/GNNMath.cuh" #include "galois/Logging.h" -#include "galois/GNNMath.h" // Please add GPU functions -#include "galois/layers/SoftmaxLayer.h" +#include "galois/layers/SoftmaxLayer.cuh" -// Allocate memory and initialize -void galois::SoftmaxLayer::Init() {} +void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, + const GNNFloat* input_embeddings, GNNFloat* output) { + char* mask_to_use = nullptr; + switch (phase) { + case GNNPhase::kTrain: + mask_to_use = train_mask_; + break; + case 
GNNPhase::kValidate: + mask_to_use = val_mask_; + break; + case GNNPhase::kTest: + mask_to_use = test_mask_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase specified"); + } -// Input: in_tensor -// Output: out_tensor -void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, - galois::GNNFloat* out_tensor) {} + SoftmaxCrossEntropyForward<<>>(mask_to_use, num_nodes, + feature_length, input_embeddings, output); +} // Input: in_tensor // Input: out_tensor @@ -18,7 +31,7 @@ void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, // it is not const because it can be reused // to hold intermediate data inside this function, // to avoid allocating more memory -void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, - const galois::GNNFloat* out_tensor, - galois::GNNFloat* in_gradients, - galois::GNNFloat* out_gradients) {} +//void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, +// const galois::GNNFloat* out_tensor, +// galois::GNNFloat* in_gradients, +// galois::GNNFloat* out_gradients) {} From 77ec2f36fea802b1e5842ccdde71bc205fbd3ff5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 17 Nov 2020 14:07:35 -0600 Subject: [PATCH 428/660] Softmax forward phase fixes; unit test for it too Fixed some bugs exposed by the unit test for softmax forward, namely that the feature length size was incorrect and that the vector was not being 0'd out before softmax occured. The unit test in question has been ported over from the cpu softmax unit test as well. The next step is to finish up the backward pass for the softmax layer and reactivate the unit test calls to the backward phase. I also need to consider actually checking backward phase output to make sure it is sane. --- libgnn/include/galois/GNNMath.cuh | 3 +- libgnn/include/galois/layers/SoftmaxLayer.h | 3 - libgnn/src/GNNMath.cu | 15 ++- libgnn/src/layers/GNNLayer.cpp | 1 - libgnn/src/layers/SoftmaxLayer.cpp | 9 +- libgnn/src/layers/SoftmaxLayer.cu | 15 ++- libgnn/test/CMakeLists.txt | 5 + libgnn/test/gpu-softmaxlayer-test.cpp | 118 ++++++++++++++++++++ 8 files changed, 147 insertions(+), 22 deletions(-) create mode 100644 libgnn/test/gpu-softmaxlayer-test.cpp diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 01afe64f03..e63221f87f 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -18,7 +18,8 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); -//! Runs softmax + cross entropy on masked nodes +//! Runs softmax + cross entropy on masked nodes. Will not overwrite all of +//! the output, so make sure it's been zero'd out beforehand. __global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, const galois::GNNFloat* input_embeddings, diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 76a7ec654f..7bf29272cd 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -29,7 +29,6 @@ class SoftmaxLayer : public GNNLayer { GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); // output needs to match number of possible classes GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); - Init(); } const PointerWithSize @@ -64,8 +63,6 @@ class SoftmaxLayer : public GNNLayer { //! 
derivative calculation; each is the size of a feature vector galois::substrate::PerThreadStorage> softmax_temp_vectors_; - - void Init(); }; } // namespace galois diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index d33ea88cc9..0066e85939 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -28,16 +28,19 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, output_columns)); } +__global__ void galois::SoftmaxCrossEntropyForward( + char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* input_embeddings, galois::GNNFloat* output) { -__global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, - const galois::GNNFloat* input_embeddings, - galois::GNNFloat* output) { - // XXX zero out output + // NOTE: assumes that output is already 0'd out as it will not overwrite the + // entire thing CUDA_KERNEL_LOOP(i, num_nodes) { if (mask[i] == 1) { - galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, output + feature_length * i); + galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, + output + feature_length * i); // ignoring crossentropy loss calculation for now because I'm not using - // loss for anything + didn't bother allocating an array to store loss anyways + // loss for anything + didn't bother allocating an array to store loss + // anyways } } } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 3cfbd990a0..31cf58c6c7 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,7 +9,6 @@ galois::GNNLayer::GNNLayer(size_t layer_num, : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { - // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input dropout_mask_.resize( diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 57a10af41c..3a65ba55bc 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -2,9 +2,6 @@ #include "galois/GNNMath.h" #include "galois/layers/SoftmaxLayer.h" -// Allocate memory and initialize -void galois::SoftmaxLayer::Init() {} - const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { @@ -48,9 +45,9 @@ galois::SoftmaxLayer::ForwardPhase( #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else - gpu_object_.ForwardPhaseGPU(layer_phase_, graph_.size(), graph_.node_feature_length(), - input_embeddings.data(), - p_forward_output_matrix_.data()); + gpu_object_.ForwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + input_embeddings.data(), p_forward_output_matrix_.data()); return p_forward_output_matrix_; #endif } diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index a562923a98..e385214981 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -2,8 +2,11 @@ #include "galois/Logging.h" #include "galois/layers/SoftmaxLayer.cuh" -void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, - const GNNFloat* input_embeddings, GNNFloat* output) { +void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, + size_t num_nodes, + size_t feature_length, + const GNNFloat* input_embeddings, + GNNFloat* output) { char* mask_to_use = nullptr; switch (phase) { case GNNPhase::kTrain: @@ 
-19,8 +22,10 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num GALOIS_LOG_FATAL("Invalid phase specified"); } - SoftmaxCrossEntropyForward<<>>(mask_to_use, num_nodes, - feature_length, input_embeddings, output); + CUDA_CHECK( + cudaMemset(output, 0, num_nodes * feature_length * sizeof(GNNFloat))); + SoftmaxCrossEntropyForward<<>>( + mask_to_use, num_nodes, feature_length, input_embeddings, output); } // Input: in_tensor @@ -31,7 +36,7 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num // it is not const because it can be reused // to hold intermediate data inside this function, // to avoid allocating more memory -//void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, +// void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, // const galois::GNNFloat* out_tensor, // galois::GNNFloat* in_gradients, // galois::GNNFloat* out_gradients) {} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 8bec96c4d6..01199c1247 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -42,6 +42,11 @@ else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) + + add_executable(gpu-softmaxlayer-test gpu-softmaxlayer-test.cpp) + target_link_libraries(gpu-softmaxlayer-test galois_gnn) + add_test(NAME gpu-softmaxlayer-test COMMAND gpu-softmaxlayer-test) + endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp new file mode 100644 index 0000000000..2bceb7a6b4 --- /dev/null +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -0,0 +1,118 @@ +//! @file convlayer-test.cpp +//! 
Softmax layer test with a test graph + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SoftmaxLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); + + GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + + // train mode + auto output_layer = + std::make_unique(3, test_graph, dimension_0); + // input to softmax + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[8] = 1; + softmax_input[16] = 1; + softmax_input[24] = 1; + softmax_input[32] = 1; + softmax_input[40] = 1; + softmax_input[48] = 1; + galois::PointerWithSize p_softmax_input = + output_layer->AllocateGPU(softmax_input); + + output_layer->ForwardPhase(p_softmax_input); + + const std::vector& prediction_distribution = + output_layer->CopyForwardOutputFromGPU(); + + // assert that predictions are as expected + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[i * 7])) == + i); + } + // train mode means last 2 vertices should be empty + for (size_t i = 5; i < 7; i++) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + + // XXX + // output_layer->BackwardPhase(p_softmax_input, nullptr); + + // validation mode + output_layer->SetLayerPhase(galois::GNNPhase::kValidate); + output_layer->ForwardPhase(p_softmax_input); + std::vector pd2 = output_layer->CopyForwardOutputFromGPU(); + + // validate vertex is index 5 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + for (size_t i = 6; i < 7; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + + // XXX + // output_layer->BackwardPhase(p_softmax_input, nullptr); + + // test mode + output_layer->SetLayerPhase(galois::GNNPhase::kTest); + output_layer->ForwardPhase(p_softmax_input); + std::vector pd3 = output_layer->CopyForwardOutputFromGPU(); + // validate vertex is index 6 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); + // all but last are empty distributions + for (size_t 
i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(pd3[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); + } + + // XXX + // output_layer->BackwardPhase(softmax_input, nullptr); + + // TODO in future maybe: add better test for backward phase besides just + // running it +} From bf1b3551672885b165a6500bddb79b0178e7cdad Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 17 Nov 2020 14:25:36 -0600 Subject: [PATCH 429/660] Softmax: mask selection function refactoring Moved code to select the right mask pointer passed on the current layer phase to a function as it will be used in backward phase as well. --- libgnn/include/galois/layers/SoftmaxLayer.cuh | 15 ++++++++++++ libgnn/src/layers/SoftmaxLayer.cu | 23 +++++++------------ 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index 40e9681bb1..ee1350f2bd 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -23,6 +23,21 @@ private: char* val_mask_; char* test_mask_; GNNFloat* local_labels_; + + //! Helper function that returns the correct mask based on phase it is passed + char* ChooseMask(galois::GNNPhase phase) { + switch (phase) { + case GNNPhase::kTrain: + return train_mask_; + case GNNPhase::kValidate: + return val_mask_; + case GNNPhase::kTest: + return test_mask_; + default: + GALOIS_LOG_FATAL("Invalid phase specified"); + return nullptr; + } + } }; } // namespace galois diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index e385214981..aecdd93c52 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -7,27 +7,20 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t feature_length, const GNNFloat* input_embeddings, GNNFloat* output) { - char* mask_to_use = nullptr; - switch (phase) { - case GNNPhase::kTrain: - mask_to_use = train_mask_; - break; - case GNNPhase::kValidate: - mask_to_use = val_mask_; - break; - case GNNPhase::kTest: - mask_to_use = test_mask_; - break; - default: - GALOIS_LOG_FATAL("Invalid phase specified"); - } - + char* mask_to_use = ChooseMask(phase); CUDA_CHECK( cudaMemset(output, 0, num_nodes * feature_length * sizeof(GNNFloat))); SoftmaxCrossEntropyForward<<>>( mask_to_use, num_nodes, feature_length, input_embeddings, output); + CUDA_TEST("Softmax cross entropy forward failed"); } +// void galois::SoftmaxLayerGPU::BackwardPhaseGPU() { +// +// +// +//} + // Input: in_tensor // Input: out_tensor // Input: out_gradients From f71b7c6d2c04e09ec7d92e3647643be2add94612 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 11:57:06 -0600 Subject: [PATCH 430/660] Fixed ground truth type on GPUs Ground truth is represented with GNNLabel, but I was using a GNNFloat. This caused the labels being read to be garbaged when used on the GPU. This commit changes it them to the correct type. It also includes the signature definition of the backward phase: the implementation will be included in the next commit. 
(Split the commits up for modularity's sake) --- libgnn/include/galois/graphs/GNNGraph.cuh | 4 ++-- libgnn/include/galois/layers/SoftmaxLayer.cuh | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 528fe4ceb2..d485808972 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -28,7 +28,7 @@ public: int* edge_index() const { return edge_index_; } int* edge_destinations() const { return edge_destinations_; } - GNNFloat* ground_truth() const { return ground_truth_; } + GNNLabel* ground_truth() const { return ground_truth_; } char* local_training_mask() const { return local_training_mask_; } char* local_validation_mask() const { return local_validation_mask_; } @@ -53,7 +53,7 @@ private: //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; //! (Local) ground truth vector - GNNFloat* ground_truth_{nullptr}; + GNNLabel* ground_truth_{nullptr}; // masks for phases char* local_training_mask_{nullptr}; char* local_validation_mask_{nullptr}; diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index ee1350f2bd..8e1e5d21d7 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -16,13 +16,15 @@ public: void ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, const GNNFloat* input_embeddings, GNNFloat* output); - void BackwardPhaseGPU(GNNFloat* output); + void BackwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, + size_t feature_length, const GNNFloat* predictions, + GNNFloat* output_gradient); private: char* train_mask_; char* val_mask_; char* test_mask_; - GNNFloat* local_labels_; + GNNLabel* local_labels_; //! Helper function that returns the correct mask based on phase it is passed char* ChooseMask(galois::GNNPhase phase) { From 4562c8b551da74996055523218a5765e5a7bc05a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 12:27:47 -0600 Subject: [PATCH 431/660] Softmax backward phase on GPU Adds the backward phase for the softmax layer for the GPU. The implementation is taken from the non-refactored old code: it copies a prediction to shared memory (presumably to improve locality) then does cross entropy to softmax derivatives. It remains to be seen if the shared memory copy is actually more efficient; some testing will be done down the line. Also adds print to both cpu and gpu softmax tests in order to verify that both are doing the same compute (which they are in this commit). --- libgnn/include/galois/GNNMath.cuh | 10 ++++ libgnn/src/GNNMath.cu | 83 +++++++++++++++++++++++++++ libgnn/src/layers/SoftmaxLayer.cpp | 10 +++- libgnn/src/layers/SoftmaxLayer.cu | 36 ++++++------ libgnn/test/gpu-softmaxlayer-test.cpp | 27 +++++++-- libgnn/test/softmaxlayer-test.cpp | 22 ++++++- 6 files changed, 158 insertions(+), 30 deletions(-) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index e63221f87f..aca14a573f 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -20,11 +20,21 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, //! Runs softmax + cross entropy on masked nodes. Will not overwrite all of //! the output, so make sure it's been zero'd out beforehand. +//! At this point in time cross entropy is ignored because it only calculates a +//! 
loss value which doesn't really do anything for us at the moment. __global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, const galois::GNNFloat* input_embeddings, galois::GNNFloat* output); +//! Derivative of cross entropy (to get error of prediction) then derivavtive +//! of the softmax. +__global__ void +SoftmaxCrossEntropyBackward(char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* predictions, + const galois::GNNLabel* ground_truth, + galois::GNNFloat* output_gradient); + //! Given a vector, apply a softmax on some specified # of elements and save //! the result to the specified output. Since this is a device function, //! all pointers should be to GPU memory. diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 0066e85939..5b429dafb2 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -45,6 +45,89 @@ __global__ void galois::SoftmaxCrossEntropyForward( } } +__global__ void galois::SoftmaxCrossEntropyBackward( + char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* predictions, const galois::GNNLabel* ground_truth, + galois::GNNFloat* output_gradient) { + const unsigned global_thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned warp_thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = global_thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // TODO: how many classes can there be? it's a set quantity at the moment + // copy of a particular node's prediction; put into shared memory to avoid + // overheads of accessing it otherwise + // TODO benchmark + __shared__ GNNFloat + local_node_prediction[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ GNNFloat + intermediate_gradient[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + + // a warp works on a single node at once + for (unsigned wid = warp_id; wid < num_nodes; wid += num_warps) { + // operate only if masked + if (mask[wid] == 1) { + unsigned base_index = wid * feature_length; + + // copy over a prediction to shared memory (faster access time) + // TODO benchmark this to see if worth + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + local_node_prediction[warp_lane][feat_index] = + predictions[base_index + feat_index]; + } + } + // do not proceed until entire prediction is copied to shared memory + __syncthreads(); + + // TODO can refactor below to device functions + // cross entropy derivative + // each thread of warp takes different feature + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + if (feat_index == (unsigned)ground_truth[wid]) { + // this thread is responsible for the truth + intermediate_gradient[warp_lane][feat_index] = + -1.0 / (local_node_prediction[warp_lane][feat_index] + 1e-10); + } else { + // all others are 0 (ground truth label = 0) + intermediate_gradient[warp_lane][feat_index] = 0.0; + } + } + } + __syncthreads(); + + // softmax derivative + // each thread of warp takes different feature + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + GNNFloat 
sum = 0.0; + GNNFloat self = local_node_prediction[warp_lane][feat_index]; + + for (unsigned j = 0; j < feature_length; j++) { + GNNFloat df = (j == feat_index) + ? (self * (1.0 - self)) + : -local_node_prediction[warp_lane][j] * self; + sum += df * intermediate_gradient[warp_lane][j]; + } + + // each thread saves final output for the feature + output_gradient[base_index + feat_index] = sum; + } + } + __syncthreads(); + } + } +} + __device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, GNNFloat* output) { // find max value diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 3a65ba55bc..a4d5133caa 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -56,6 +56,9 @@ galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; + // zero out output + backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { @@ -101,9 +104,10 @@ galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, #ifndef GALOIS_ENABLE_GPU return BackwardPhaseCPU(); #else - // XXX - // gpu_object_.BackwardPhaseGPU( - return PointerWithSize(); + gpu_object_.BackwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); + return p_backward_output_matrix_; #endif } diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index aecdd93c52..f24a6f1e77 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -1,3 +1,4 @@ +#include #include "galois/GNNMath.cuh" #include "galois/Logging.h" #include "galois/layers/SoftmaxLayer.cuh" @@ -15,21 +16,20 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, CUDA_TEST("Softmax cross entropy forward failed"); } -// void galois::SoftmaxLayerGPU::BackwardPhaseGPU() { -// -// -// -//} - -// Input: in_tensor -// Input: out_tensor -// Input: out_gradients -// Output: in_gradients -// Note: although out_gradients is an input data, -// it is not const because it can be reused -// to hold intermediate data inside this function, -// to avoid allocating more memory -// void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, -// const galois::GNNFloat* out_tensor, -// galois::GNNFloat* in_gradients, -// galois::GNNFloat* out_gradients) {} +void galois::SoftmaxLayerGPU::BackwardPhaseGPU(galois::GNNPhase phase, + size_t num_nodes, + size_t feature_length, + const GNNFloat* predictions, + GNNFloat* output_gradient) { + assert(feature_length <= MAX_NUM_CLASSES); + char* mask_to_use = ChooseMask(phase); + CUDA_CHECK(cudaMemset(output_gradient, 0, + num_nodes * feature_length * sizeof(GNNFloat))); + // TODO check the launch parameters; this is taken directly from the original + // code + SoftmaxCrossEntropyBackward<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(mask_to_use, num_nodes, + feature_length, predictions, + local_labels_, output_gradient); + CUDA_TEST("Softmax cross entropy backward failed"); +} diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 2bceb7a6b4..453606e311 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -61,8 +61,13 @@ int main() { GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); } - // XXX - // 
output_layer->BackwardPhase(p_softmax_input, nullptr); + output_layer->BackwardPhase(p_softmax_input, nullptr); + const std::vector& backward_output = + output_layer->CopyBackwardOutputFromGPU(); + printf("Output 1\n========\n"); + for (galois::GNNFloat a : backward_output) { + printf("%f\n", a); + } // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); @@ -90,8 +95,13 @@ int main() { GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); } - // XXX - // output_layer->BackwardPhase(p_softmax_input, nullptr); + output_layer->BackwardPhase(p_softmax_input, nullptr); + const std::vector& backward_output2 = + output_layer->CopyBackwardOutputFromGPU(); + printf("Output 2\n========\n"); + for (galois::GNNFloat a : backward_output2) { + printf("%f\n", a); + } // test mode output_layer->SetLayerPhase(galois::GNNPhase::kTest); @@ -110,8 +120,13 @@ int main() { GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); } - // XXX - // output_layer->BackwardPhase(softmax_input, nullptr); + output_layer->BackwardPhase(softmax_input, nullptr); + const std::vector& backward_output3 = + output_layer->CopyBackwardOutputFromGPU(); + printf("Output 3\n========\n"); + for (galois::GNNFloat a : backward_output3) { + printf("%f\n", a); + } // TODO in future maybe: add better test for backward phase besides just // running it diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index f7baab24fd..9f15bedfa3 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -40,7 +40,13 @@ int main() { std::make_unique(3, test_graph, dimension_0); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); - output_layer->BackwardPhase(softmax_input, nullptr); + + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 1\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } // assert that predictions are as expected for (size_t i = 0; i < 5; i++) { @@ -62,7 +68,12 @@ int main() { output_layer->SetLayerPhase(galois::GNNPhase::kValidate); galois::PointerWithSize pd2 = output_layer->ForwardPhase(softmax_input); - output_layer->BackwardPhase(softmax_input, nullptr); + asdf = output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 2\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } + // validate vertex is index 5 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); for (size_t i = 0; i < 5; i++) { @@ -88,7 +99,12 @@ int main() { output_layer->SetLayerPhase(galois::GNNPhase::kTest); galois::PointerWithSize pd3 = output_layer->ForwardPhase(softmax_input); - output_layer->BackwardPhase(softmax_input, nullptr); + asdf = output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 3\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } + // validate vertex is index 6 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); // all but last are empty distributions From 4e5266caf94184b4f827b55a1567a8372f83e886 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 14:56:47 -0600 Subject: [PATCH 432/660] Structure added for GPU global accuracy function This commit adds the declarations for the global accuracy getter for GPU GNNs as well as the orchestration of the call to the GPU version. The rest of the implementation will come in a later commit: for now this isn't priority as I can still compute accuracy on the CPU. 
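For reference, the check itself is simple; a minimal single-host sketch of the
logic the GPU kernel will mirror (illustrative only, with a made-up helper
name, not the code in this patch) looks like this:

  // Sketch: fraction of masked nodes whose argmax prediction matches the label.
  float MaskedAccuracy(const char* mask, const galois::GNNLabel* ground_truth,
                       const galois::GNNFloat* predictions, size_t num_nodes,
                       size_t num_classes) {
    size_t correct = 0;
    size_t checked = 0;
    for (size_t node = 0; node < num_nodes; node++) {
      if (!mask[node]) {
        continue;
      }
      checked++;
      size_t predicted_class =
          galois::MaxIndex(num_classes, &predictions[node * num_classes]);
      if (predicted_class == static_cast<size_t>(ground_truth[node])) {
        correct++;
      }
    }
    return static_cast<float>(correct) / static_cast<float>(checked);
  }

The distributed path additionally reduces the correct/checked counts across
hosts (the DGAccumulator members) before taking the ratio; the GPU version
will do the same counting directly on device memory.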
Adds a new GNNGPU object to hold all GPU related things for the GNN class. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GraphNeuralNetwork.cuh | 22 +++++++++++++++++ libgnn/include/galois/GraphNeuralNetwork.h | 10 ++++++++ libgnn/src/GraphNeuralNetwork.cpp | 11 +++++++++ libgnn/src/GraphNeuralNetwork.cu | 26 ++++++++++++++++++++ 5 files changed, 70 insertions(+) create mode 100644 libgnn/include/galois/GraphNeuralNetwork.cuh create mode 100644 libgnn/src/GraphNeuralNetwork.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ca50e171ee..61867f21c8 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -35,6 +35,7 @@ if (GALOIS_ENABLE_GPU) src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu src/layers/SoftmaxLayer.cu + src/GraphNeuralNetwork.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/GraphNeuralNetwork.cuh b/libgnn/include/galois/GraphNeuralNetwork.cuh new file mode 100644 index 0000000000..dd2eeed8b0 --- /dev/null +++ b/libgnn/include/galois/GraphNeuralNetwork.cuh @@ -0,0 +1,22 @@ +#ifndef GALOIS_GNN_GPU_CLASS +#define GALOIS_GNN_GPU_CLASS + +#include "galois/GNNTypes.h" +#include "galois/graphs/GNNGraph.cuh" + +namespace galois { + +//! Helper class for a GNN: holds GPU arguments. In its own class so that the +//! compiler used for it can differ from the main CPU code +class GraphNeuralNetworkGPU { +public: + //! Gets accuracy of a prediction given pointers to the data on the GPU + float + GetGlobalAccuracyGPU(const galois::graphs::GNNGraphGPUAllocations& gpu_graph, + galois::GNNPhase phase, + const galois::PointerWithSize predictions); +}; + +} // namespace galois + +#endif diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 9e7e2266d0..652b1cbfad 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -9,6 +9,10 @@ #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/GraphNeuralNetwork.cuh" +#endif + namespace galois { //////////////////////////////////////////////////////////////////////////////// @@ -144,6 +148,8 @@ class GraphNeuralNetwork { float GetGlobalAccuracy(const PointerWithSize predictions); + float GetGlobalAccuracyCPU(const PointerWithSize predictions); + //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! most literature @@ -164,6 +170,10 @@ class GraphNeuralNetwork { DGAccumulator num_correct_; //! Used to count total number of things checked during accuracy calculation DGAccumulator total_checked_; +#ifdef GALOIS_ENABLE_GPU + //! Holds all GPU functions + GraphNeuralNetworkGPU gpu_object_; +#endif }; } // namespace galois diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index eb419ba26c..e669feac50 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -101,6 +101,17 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( const PointerWithSize predictions) { + // TODO mark as a forwarding argument? 
+#ifndef GALOIS_ENABLE_GPU + return GetGlobalAccuracyCPU(predictions); +#else + return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, + predictions); +#endif +} + +float galois::GraphNeuralNetwork::GetGlobalAccuracyCPU( + const PointerWithSize predictions) { // check owned nodes' accuracy size_t num_labels = graph_->GetNumLabelClasses(); assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); diff --git a/libgnn/src/GraphNeuralNetwork.cu b/libgnn/src/GraphNeuralNetwork.cu new file mode 100644 index 0000000000..a16c4b2b69 --- /dev/null +++ b/libgnn/src/GraphNeuralNetwork.cu @@ -0,0 +1,26 @@ +#include "galois/GraphNeuralNetwork.cuh" +#include "galois/Logging.h" + +float galois::GraphNeuralNetworkGPU::GetGlobalAccuracyGPU( + const graphs::GNNGraphGPUAllocations& gpu_graph, GNNPhase phase, + const PointerWithSize predictions) { + // get correct mask + char* mask_to_use = nullptr; + switch (phase) { + case GNNPhase::kTrain: + mask_to_use = gpu_graph.local_training_mask(); + break; + case GNNPhase::kValidate: + mask_to_use = gpu_graph.local_validation_mask(); + break; + case GNNPhase::kTest: + mask_to_use = gpu_graph.local_testing_mask(); + break; + default: + GALOIS_LOG_FATAL("Invalid phase specified"); + } + + // run accuracy check kernel on GPU + + return 0.0; +} From b9e3b32afdedb6ed91fdf8968e96118298987d4c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 17:28:28 -0600 Subject: [PATCH 433/660] Allocate GPU memory for Adam optimizer Adds a GPU Adam optimizer class that holds the allocations for the moments used in the adam optimizer on the GPU. Adds a gpu version of the adam test as well to make sure build is sane in its current state. The CPU optimizer class is also now split into the CPU/GPU paths depending on which build is being used. Next step is to do the adam optimizer on the GPU proper. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNOptimizers.cuh | 30 +++++++++++++++ libgnn/include/galois/GNNOptimizers.h | 31 +++++++++++++++- libgnn/src/GNNOptimizers.cpp | 4 +- libgnn/src/GNNOptimizers.cu | 29 +++++++++++++++ libgnn/test/CMakeLists.txt | 3 ++ libgnn/test/gpu-adam-test.cpp | 49 +++++++++++++++++++++++++ 7 files changed, 144 insertions(+), 3 deletions(-) create mode 100644 libgnn/include/galois/GNNOptimizers.cuh create mode 100644 libgnn/src/GNNOptimizers.cu create mode 100644 libgnn/test/gpu-adam-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 61867f21c8..362fc7f773 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -36,6 +36,7 @@ if (GALOIS_ENABLE_GPU) src/layers/GraphConvolutionalLayer.cu src/layers/SoftmaxLayer.cu src/GraphNeuralNetwork.cu + src/GNNOptimizers.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/GNNOptimizers.cuh b/libgnn/include/galois/GNNOptimizers.cuh new file mode 100644 index 0000000000..13fcb97263 --- /dev/null +++ b/libgnn/include/galois/GNNOptimizers.cuh @@ -0,0 +1,30 @@ +#ifndef GALOIS_GPU_GNN_OPT +#define GALOIS_GPU_GNN_OPT + +#include +#include "galois/GNNTypes.h" + +namespace galois { + +//! Holds GPU memory for the adam optimizer as well as function definitions +//! for weight adjustment +class AdamOptimizerGPU { +public: + //! Initializes the moment vectors on the GPU based on provided sizes + AdamOptimizerGPU(const std::vector& trainable_layer_sizes, + size_t num_trainable); + //! 
Frees moment vectors and vector of pointers to moments + ~AdamOptimizerGPU(); + + GNNFloat* first_moment(size_t i) { return first_moments_[i]; }; + GNNFloat* second_moment(size_t i) { return second_moments_[i]; }; + +private: + size_t num_layers_; + std::vector first_moments_; + std::vector second_moments_; +}; + +} // namespace galois + +#endif diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index c0e8dd2582..9528612ef4 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -10,6 +10,10 @@ #include #include +#ifdef GALOIS_ENABLE_GPU +#include "galois/GNNOptimizers.cuh" +#endif + namespace galois { //! Virtual class; optimizers all need the descent function @@ -41,18 +45,35 @@ class AdamOptimizer : public BaseOptimizer { AdamOptimizer(const AdamConfiguration& config, const std::vector& trainable_layer_sizes, size_t num_trainable_layers) - : config_(config), num_trainable_layers_(num_trainable_layers), + : +#ifdef GALOIS_ENABLE_GPU + gpu_object_(trainable_layer_sizes, num_trainable_layers), +#endif + config_(config), num_trainable_layers_(num_trainable_layers), beta1_power_t_(num_trainable_layers_, config.beta1), beta2_power_t_(num_trainable_layers_, config.beta2) { // >= because only prefix will be considered otherwise assert(trainable_layer_sizes.size() >= num_trainable_layers_); +#ifndef GALOIS_ENABLE_GPU // allocate vectors based on # of trainable layers for (size_t i = 0; i < num_trainable_layers_; i++) { first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + // Pointer with size construction + p_first_moments_.emplace_back(first_moments_.back()); + p_second_moments_.emplace_back(second_moments_.back()); } assert(first_moments_.size() == num_trainable_layers_); assert(second_moments_.size() == num_trainable_layers_); +#else + // pointer with size initialization with GPU pointers + for (size_t i = 0; i < num_trainable_layers_; i++) { + p_first_moments_.emplace_back(gpu_object_.first_moment(i), + trainable_layer_sizes[i]); + p_second_moments_.emplace_back(gpu_object_.second_moment(i), + trainable_layer_sizes[i]); + } +#endif } //! Adam based gradient descent void GradientDescent(const std::vector& derivatives, @@ -60,12 +81,20 @@ class AdamOptimizer : public BaseOptimizer { size_t layer_number) final; private: +#ifdef GALOIS_ENABLE_GPU + AdamOptimizerGPU gpu_object_; +#endif + //! Configuration options for this layer AdamConfiguration config_; //! First moment vectors; one for each trainable layer std::vector> first_moments_; //! Second moment vectors; one for each trainable layer std::vector> second_moments_; + // PointerWithSize versions of first/second moments (for use in function + // to support GPU pointers as well + std::vector> p_first_moments_; + std::vector> p_second_moments_; //! 
Number of layers that can be trained (need moment vectors for each) size_t num_trainable_layers_; // power terms used in adam: updated by raising power every time update is diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 53088825fd..94d51310b9 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -9,8 +9,8 @@ void galois::AdamOptimizer::GradientDescent( assert(derivatives.size() == matrix->size()); // grab based on layer being used - std::vector& first_moment = first_moments_[layer_number]; - std::vector& second_moment = second_moments_[layer_number]; + PointerWithSize& first_moment = p_first_moments_[layer_number]; + PointerWithSize& second_moment = p_second_moments_[layer_number]; assert(derivatives.size() == first_moment.size()); assert(derivatives.size() == second_moment.size()); diff --git a/libgnn/src/GNNOptimizers.cu b/libgnn/src/GNNOptimizers.cu new file mode 100644 index 0000000000..ff5b771b59 --- /dev/null +++ b/libgnn/src/GNNOptimizers.cu @@ -0,0 +1,29 @@ +#include "galois/GNNOptimizers.cuh" +#include "galois/CUDAUtil.h" + +galois::AdamOptimizerGPU::AdamOptimizerGPU( + const std::vector& trainable_layer_sizes, size_t num_trainable) { + num_layers_ = num_trainable; + first_moments_.resize(num_layers_); + second_moments_.resize(num_layers_); + + for (size_t layer = 0; layer < num_layers_; layer++) { + // initialize the moment vector memory then zero it all out + CUDA_CHECK(cudaMalloc((void**)(&(first_moments_[layer])), + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + CUDA_CHECK(cudaMalloc((void**)(&(second_moments_[layer])), + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(first_moments_[layer], 0, + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(second_moments_[layer], 0, + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + } +} + +galois::AdamOptimizerGPU::~AdamOptimizerGPU() { + // loop through and free first/second moments + for (size_t layer = 0; layer < num_layers_; layer++) { + CUDA_FREE(first_moments_[layer]); + CUDA_FREE(second_moments_[layer]); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 01199c1247..9c7547b8d3 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -47,6 +47,9 @@ else() target_link_libraries(gpu-softmaxlayer-test galois_gnn) add_test(NAME gpu-softmaxlayer-test COMMAND gpu-softmaxlayer-test) + add_executable(gpu-adam-test gpu-adam-test.cpp) + target_link_libraries(gpu-adam-test galois_gnn) + #add_test(NAME gpu-adam-test COMMAND gpu-adam-test) endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp new file mode 100644 index 0000000000..faee872bfa --- /dev/null +++ b/libgnn/test/gpu-adam-test.cpp @@ -0,0 +1,49 @@ +//! @file adam-test.cpp +//! 
Tests the adam optimizer +#include "galois/DistGalois.h" +#include "galois/GNNOptimizers.h" +#include "galois/Logging.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + // create sample config that is easy to trace + galois::AdamOptimizer::AdamConfiguration config; + config.alpha = 1; + config.beta1 = 0.5; + config.beta2 = 0.5; + config.epsilon = 0; + + std::vector layer_sizes = {2, 1}; + galois::AdamOptimizer adam(config, layer_sizes, 2); + printf("%p\n", &adam); + + // std::vector weights1 = {1, 1}; + // std::vector weights2 = {10}; + // std::vector grad1 = {1, 1}; + // std::vector grad2 = {10}; + + // adam.GradientDescent(grad1, &weights1, 0); + //// check weights + // GALOIS_LOG_ASSERT(weights1[0] == 0.0); + // GALOIS_LOG_ASSERT(weights1[1] == 0.0); + + // adam.GradientDescent(grad2, &weights2, 1); + // GALOIS_LOG_ASSERT(weights2[0] == 9.0); + + //// run again to check if adam keeps moments from before + // adam.GradientDescent(grad1, &weights1, 0); + //// check weights again (turns out derivative one ends up doing same thing) + // GALOIS_LOG_ASSERT(weights1[0] == -1.0); + // GALOIS_LOG_ASSERT(weights1[1] == -1.0); + + //// grad 2 again + // adam.GradientDescent(grad2, &weights2, 1); + // GALOIS_LOG_ASSERT(weights2[0] == 8.0); +} From af9c72a664438dc5003021ec10c3e3849f87110e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 17:57:54 -0600 Subject: [PATCH 434/660] GradientDescent call now uses PointerWithSize The gradient descent call in the optimizers now uses PointerWithSize rather than std::vectors. This is for compatibility with GPU pointers. Calls to the function have been changed throughout the code accordingly. --- libgnn/include/galois/GNNOptimizers.h | 8 ++++---- libgnn/src/GNNOptimizers.cpp | 12 ++++++++---- libgnn/src/layers/GNNLayer.cpp | 2 +- libgnn/test/CMakeLists.txt | 2 +- libgnn/test/adam-test.cpp | 8 ++++---- libgnn/test/gpu-adam-test.cpp | 11 +++++------ 6 files changed, 23 insertions(+), 20 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 9528612ef4..e649b73887 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -19,8 +19,8 @@ namespace galois { //! Virtual class; optimizers all need the descent function class BaseOptimizer { public: - virtual void GradientDescent(const std::vector& derivatives, - std::vector* matrix, + virtual void GradientDescent(PointerWithSize derivatives, + PointerWithSize matrix, size_t layer_number) = 0; }; @@ -76,8 +76,8 @@ class AdamOptimizer : public BaseOptimizer { #endif } //! 
Adam based gradient descent - void GradientDescent(const std::vector& derivatives, - std::vector* matrix, + void GradientDescent(PointerWithSize derivatives, + PointerWithSize matrix, size_t layer_number) final; private: diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 94d51310b9..fa1f4dd10c 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -4,9 +4,9 @@ #include void galois::AdamOptimizer::GradientDescent( - const std::vector& derivatives, std::vector* matrix, + PointerWithSize derivatives, PointerWithSize matrix, size_t layer_number) { - assert(derivatives.size() == matrix->size()); + assert(derivatives.size() == matrix.size()); // grab based on layer being used PointerWithSize& first_moment = p_first_moments_[layer_number]; @@ -14,9 +14,10 @@ void galois::AdamOptimizer::GradientDescent( assert(derivatives.size() == first_moment.size()); assert(derivatives.size() == second_moment.size()); +#ifndef GALOIS_ENABLE_GPU // individual weight updates via gradients galois::do_all( - galois::iterate(static_cast(0), matrix->size()), + galois::iterate(static_cast(0), matrix.size()), [&](size_t i) { // moment estimate updates first_moment[i] = config_.beta1 * first_moment[i] + @@ -30,11 +31,14 @@ void galois::AdamOptimizer::GradientDescent( GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2_power_t_[layer_number]); // weight update using bias corrected moments - (matrix->data())[i] -= + (matrix.data())[i] -= config_.alpha * bias_correct_first / (std::sqrt(bias_correct_second) + config_.epsilon); }, galois::loopname("AdamOptimizerGradientDescent")); +#else + // gpu_object_.DoAdamUpdate(first_moment.data(), second_moment.data(), ); +#endif // update the power terms for next update call beta1_power_t_[layer_number] *= config_.beta1; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 31cf58c6c7..5688e13c31 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -179,7 +179,7 @@ void galois::GNNLayer::ActivationDerivative( void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number) { - optimizer->GradientDescent(layer_weight_gradients_, &layer_weights_, + optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, trainable_layer_number); } diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9c7547b8d3..54e9bd43af 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -49,7 +49,7 @@ else() add_executable(gpu-adam-test gpu-adam-test.cpp) target_link_libraries(gpu-adam-test galois_gnn) - #add_test(NAME gpu-adam-test COMMAND gpu-adam-test) + add_test(NAME gpu-adam-test COMMAND gpu-adam-test) endif() # TODO multi host tests? 
diff --git a/libgnn/test/adam-test.cpp b/libgnn/test/adam-test.cpp index dfdfcdad00..159e27c744 100644 --- a/libgnn/test/adam-test.cpp +++ b/libgnn/test/adam-test.cpp @@ -28,21 +28,21 @@ int main() { std::vector grad1 = {1, 1}; std::vector grad2 = {10}; - adam.GradientDescent(grad1, &weights1, 0); + adam.GradientDescent(grad1, weights1, 0); // check weights GALOIS_LOG_ASSERT(weights1[0] == 0.0); GALOIS_LOG_ASSERT(weights1[1] == 0.0); - adam.GradientDescent(grad2, &weights2, 1); + adam.GradientDescent(grad2, weights2, 1); GALOIS_LOG_ASSERT(weights2[0] == 9.0); // run again to check if adam keeps moments from before - adam.GradientDescent(grad1, &weights1, 0); + adam.GradientDescent(grad1, weights1, 0); // check weights again (turns out derivative one ends up doing same thing) GALOIS_LOG_ASSERT(weights1[0] == -1.0); GALOIS_LOG_ASSERT(weights1[1] == -1.0); // grad 2 again - adam.GradientDescent(grad2, &weights2, 1); + adam.GradientDescent(grad2, weights2, 1); GALOIS_LOG_ASSERT(weights2[0] == 8.0); } diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index faee872bfa..24a19fb66c 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -22,14 +22,13 @@ int main() { std::vector layer_sizes = {2, 1}; galois::AdamOptimizer adam(config, layer_sizes, 2); - printf("%p\n", &adam); - // std::vector weights1 = {1, 1}; - // std::vector weights2 = {10}; - // std::vector grad1 = {1, 1}; - // std::vector grad2 = {10}; + std::vector weights1 = {1, 1}; + std::vector weights2 = {10}; + std::vector grad1 = {1, 1}; + std::vector grad2 = {10}; - // adam.GradientDescent(grad1, &weights1, 0); + adam.GradientDescent(grad1, weights1, 0); //// check weights // GALOIS_LOG_ASSERT(weights1[0] == 0.0); // GALOIS_LOG_ASSERT(weights1[1] == 0.0); From 599ee5f4432e2255ef1e0676c3485e62c384a3a9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 18:43:22 -0600 Subject: [PATCH 435/660] Adam optimizer on GPU + test done; CPU fix Implements Adam optimization on the GPU and makes sure it's sane via the gpu unit test. Also fixes an inconsistency with the CPU adam optimizer where a sqrt wasn't being applied to epsilon like it is in the original non-refactored code. --- libgnn/include/galois/GNNOptimizers.cuh | 11 +++++ libgnn/include/galois/GNNOptimizers.h | 6 +++ libgnn/src/GNNOptimizers.cpp | 11 +++-- libgnn/src/GNNOptimizers.cu | 39 ++++++++++++++++++ libgnn/test/gpu-adam-test.cpp | 54 ++++++++++++++++++------- 5 files changed, 103 insertions(+), 18 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.cuh b/libgnn/include/galois/GNNOptimizers.cuh index 13fcb97263..42f499b557 100644 --- a/libgnn/include/galois/GNNOptimizers.cuh +++ b/libgnn/include/galois/GNNOptimizers.cuh @@ -19,6 +19,17 @@ public: GNNFloat* first_moment(size_t i) { return first_moments_[i]; }; GNNFloat* second_moment(size_t i) { return second_moments_[i]; }; + //! Calls into a GPU kernel; needs to be done this way as this cuh is included + //! in a GCC build, so the kernel cannot be defined in this header. + void AdamUpdate(const GNNFloat* derivatives, GNNFloat* matrix_to_update, + size_t matrix_size, GNNFloat* first_moment, + GNNFloat* second_moment, GNNFloat alpha, GNNFloat beta1, + GNNFloat beta2, GNNFloat epsilon, GNNFloat beta1t, + GNNFloat beta2t); + + //! 
Helper to copy gpu pointer to cpu vector + void CopyToVector(std::vector& to, PointerWithSize from); + private: size_t num_layers_; std::vector first_moments_; diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index e649b73887..86a656fd30 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -80,6 +80,12 @@ class AdamOptimizer : public BaseOptimizer { PointerWithSize matrix, size_t layer_number) final; +#ifdef GALOIS_ENABLE_GPU + //! helper function for unit testing to do some vector copying + void CopyToVector(std::vector& to, PointerWithSize from) { + gpu_object_.CopyToVector(to, from); + } +#endif private: #ifdef GALOIS_ENABLE_GPU AdamOptimizerGPU gpu_object_; diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index fa1f4dd10c..566b61c14e 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -31,13 +31,16 @@ void galois::AdamOptimizer::GradientDescent( GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2_power_t_[layer_number]); // weight update using bias corrected moments - (matrix.data())[i] -= - config_.alpha * bias_correct_first / - (std::sqrt(bias_correct_second) + config_.epsilon); + (matrix.data())[i] -= config_.alpha * bias_correct_first / + std::sqrt(bias_correct_second + config_.epsilon); }, galois::loopname("AdamOptimizerGradientDescent")); #else - // gpu_object_.DoAdamUpdate(first_moment.data(), second_moment.data(), ); + gpu_object_.AdamUpdate(derivatives.data(), matrix.data(), matrix.size(), + first_moment.data(), second_moment.data(), + config_.alpha, config_.beta1, config_.beta2, + config_.epsilon, beta1_power_t_[layer_number], + beta2_power_t_[layer_number]); #endif // update the power terms for next update call diff --git a/libgnn/src/GNNOptimizers.cu b/libgnn/src/GNNOptimizers.cu index ff5b771b59..77f3e74f5f 100644 --- a/libgnn/src/GNNOptimizers.cu +++ b/libgnn/src/GNNOptimizers.cu @@ -27,3 +27,42 @@ galois::AdamOptimizerGPU::~AdamOptimizerGPU() { CUDA_FREE(second_moments_[layer]); } } +void galois::AdamOptimizerGPU::CopyToVector(std::vector& to, + PointerWithSize from) { + CUDA_CHECK(cudaMemcpy(to.data(), from.data(), to.size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + +namespace { + +__global__ void DoAdamUpdate(const galois::GNNFloat* derivatives, + galois::GNNFloat* matrix_to_update, + size_t matrix_size, galois::GNNFloat* first_moment, + galois::GNNFloat* second_moment, + galois::GNNFloat alpha, galois::GNNFloat beta1, + galois::GNNFloat beta2, galois::GNNFloat epsilon, + galois::GNNFloat beta1t, galois::GNNFloat beta2t) { + CUDA_KERNEL_LOOP(i, matrix_size) { + first_moment[i] = beta1 * first_moment[i] + (1.0 - beta1) * derivatives[i]; + second_moment[i] = beta2 * second_moment[i] + + (1.0 - beta2) * (derivatives[i] * derivatives[i]); + // bias corrected moments using beta power + galois::GNNFloat bias_correct_first = first_moment[i] / (1.0 - beta1t); + galois::GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2t); + // weight update using bias corrected moments + matrix_to_update[i] -= + alpha * bias_correct_first / sqrtf(bias_correct_second + epsilon); + } +} + +} // namespace + +void galois::AdamOptimizerGPU::AdamUpdate( + const GNNFloat* derivatives, GNNFloat* matrix_to_update, size_t matrix_size, + GNNFloat* first_moment, GNNFloat* second_moment, GNNFloat alpha, + GNNFloat beta1, GNNFloat beta2, GNNFloat epsilon, GNNFloat beta1t, + GNNFloat beta2t) { + DoAdamUpdate<<>>( + derivatives, 
matrix_to_update, matrix_size, first_moment, second_moment, + alpha, beta1, beta2, epsilon, beta1t, beta2t); +} diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index 24a19fb66c..a1d0c1961e 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -3,6 +3,7 @@ #include "galois/DistGalois.h" #include "galois/GNNOptimizers.h" #include "galois/Logging.h" +#include "galois/layers/SoftmaxLayer.h" int main() { galois::DistMemSys G; @@ -23,26 +24,51 @@ int main() { std::vector layer_sizes = {2, 1}; galois::AdamOptimizer adam(config, layer_sizes, 2); + // make this layer to get access to a gpu helper function; TODO + // need a helper alloc function + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); + auto alloc_layer = + std::make_unique(3, test_graph, dimension_0); + std::vector weights1 = {1, 1}; std::vector weights2 = {10}; std::vector grad1 = {1, 1}; std::vector grad2 = {10}; - adam.GradientDescent(grad1, weights1, 0); - //// check weights - // GALOIS_LOG_ASSERT(weights1[0] == 0.0); - // GALOIS_LOG_ASSERT(weights1[1] == 0.0); + galois::PointerWithSize p_grad1 = + alloc_layer->AllocateGPU(grad1); + galois::PointerWithSize p_weights1 = + alloc_layer->AllocateGPU(weights1); + galois::PointerWithSize p_grad2 = + alloc_layer->AllocateGPU(grad2); + galois::PointerWithSize p_weights2 = + alloc_layer->AllocateGPU(weights2); + + adam.GradientDescent(p_grad1, p_weights1, 0); + adam.CopyToVector(weights1, p_weights1); + + // check weights + GALOIS_LOG_ASSERT(weights1[0] == 0.0); + GALOIS_LOG_ASSERT(weights1[1] == 0.0); - // adam.GradientDescent(grad2, &weights2, 1); - // GALOIS_LOG_ASSERT(weights2[0] == 9.0); + adam.GradientDescent(p_grad2, p_weights2, 1); + adam.CopyToVector(weights2, p_weights2); + GALOIS_LOG_ASSERT(weights2[0] == 9.0); - //// run again to check if adam keeps moments from before - // adam.GradientDescent(grad1, &weights1, 0); - //// check weights again (turns out derivative one ends up doing same thing) - // GALOIS_LOG_ASSERT(weights1[0] == -1.0); - // GALOIS_LOG_ASSERT(weights1[1] == -1.0); + // run again to check if adam keeps moments from before + adam.GradientDescent(p_grad1, p_weights1, 0); + adam.CopyToVector(weights1, p_weights1); + // check weights again (turns out derivative one ends up doing same thing) + GALOIS_LOG_ASSERT(weights1[0] == -1.0); + GALOIS_LOG_ASSERT(weights1[1] == -1.0); - //// grad 2 again - // adam.GradientDescent(grad2, &weights2, 1); - // GALOIS_LOG_ASSERT(weights2[0] == 8.0); + // grad 2 again + adam.GradientDescent(p_grad2, p_weights2, 1); + adam.CopyToVector(weights2, p_weights2); + GALOIS_LOG_ASSERT(weights2[0] == 8.0); } From 5fd1abfebb755d6c19b0960177e395dbede5c76c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 19 Nov 2020 18:36:08 -0600 Subject: [PATCH 436/660] gpu epoch test; fixes to returned pointers Adds a gpu version of the epoch test and fixes the pointers returned from a GNN layer (it was always returning CPU pointers even in the GPU build). Adds error checking to cuSparse call too. gpu-epoch-test runs a GNN end to end (still missing some features that CPU has), but it has to copy predictions over from GPU (slow, should do this from GPU end) + there seem to be accuracy issues on reddit. Will be resolved in a later commit. 
--- libgnn/include/galois/layers/GNNLayer.h | 6 +-- libgnn/src/GNNMath.cu | 1 + libgnn/src/GraphNeuralNetwork.cpp | 11 ++-- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/gpu-epoch-test.cpp | 69 +++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 libgnn/test/gpu-epoch-test.cpp diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index f4acec8f25..3296b17d20 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -88,16 +88,16 @@ class GNNLayer { } const PointerWithSize GetForwardOutput() { - return PointerWithSize(forward_output_matrix_); + return p_forward_output_matrix_; } const PointerWithSize GetBackwardOutput() { - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } //! Returns the weight gradients const PointerWithSize GetLayerWeightGradients() { - return PointerWithSize(layer_weight_gradients_); + return p_layer_weight_gradients_; } //! Returns dimensions of this layer diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 5b429dafb2..8f60f91d84 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -26,6 +26,7 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, output_columns, input_rows, input_columns, &dummy1, b, lead_dim_b, a, lead_dim_a, &dummy0, output, output_columns)); + CUDA_TEST("cublas sgemm failure"); } __global__ void galois::SoftmaxCrossEntropyForward( diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index e669feac50..ebe486b47a 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -102,12 +102,12 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( const PointerWithSize predictions) { // TODO mark as a forwarding argument? -#ifndef GALOIS_ENABLE_GPU + //#ifndef GALOIS_ENABLE_GPU return GetGlobalAccuracyCPU(predictions); -#else - return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, - predictions); -#endif + //#else + // return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, + // predictions); + //#endif } float galois::GraphNeuralNetwork::GetGlobalAccuracyCPU( @@ -158,7 +158,6 @@ void galois::GraphNeuralNetwork::GradientPropagation() { std::unique_ptr& output_layer = gnn_layers_.back(); galois::PointerWithSize current_gradients = output_layer->BackwardPhase(dummy, nullptr); - // loops through intermediate layers in a backward fashion // -1 to ignore output layer which was handled above for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 54e9bd43af..c900c7318c 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -50,6 +50,10 @@ else() add_executable(gpu-adam-test gpu-adam-test.cpp) target_link_libraries(gpu-adam-test galois_gnn) add_test(NAME gpu-adam-test COMMAND gpu-adam-test) + + add_executable(gpu-epoch-test gpu-epoch-test.cpp) + target_link_libraries(gpu-epoch-test galois_gnn) + #add_test(NAME gpu-epoch-test COMMAND gpu-epoch-test) endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp new file mode 100644 index 0000000000..6223fce8e5 --- /dev/null +++ b/libgnn/test/gpu-epoch-test.cpp @@ -0,0 +1,69 @@ +//! @file epoch-test.cpp +//! Run 50 epochs of training to see if results improve. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + // size_t num_threads = galois::setActiveThreads(1); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load graph + auto test_graph = std::make_unique( + "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = { + 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + galois::GNNLayerConfig layer_config; + layer_config.do_dropout = false; + layer_config.do_activation = false; + layer_config.do_normalization = true; + // XXX Activation kills accuracy compared to old code, esp. for cora + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + layer_config); + + std::vector adam_sizes = {16 * test_graph->node_feature_length(), + 16 * test_graph->GetNumLabelClasses()}; + auto adam = std::make_unique(adam_sizes, 2); + + std::vector cpu_pred; + cpu_pred.resize(test_graph->GetNumLabelClasses() * test_graph->size()); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + + ////////////////////////////////////////////////////////////////////////////// + + // no verification; test should be eyeballed to make sure accuracy is + // increasing + galois::StatTimer main_timer("Timer_0"); + main_timer.start(); + for (size_t epoch = 0; epoch < 50; epoch++) { + galois::PointerWithSize predictions = gnn->DoInference(); + if (cpu_pred.size() != predictions.size()) { + cpu_pred.resize(predictions.size()); + } + gnn->GradientPropagation(); + // copy to cpu + // TODO currently adam has this helper function; it should be handled + // by other class though + adam->CopyToVector(cpu_pred, predictions); + galois::gPrint("Epoch ", epoch, ": Accuracy is ", + gnn->GetGlobalAccuracy(cpu_pred), "\n"); + } + + // check test accuracy + gnn->SetLayerPhases(galois::GNNPhase::kTest); + galois::PointerWithSize predictions = gnn->DoInference(); + adam->CopyToVector(cpu_pred, predictions); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(cpu_pred), "\n"); + main_timer.stop(); +} From e500cb06a4c2313e6d948f8053b2efa32b44888d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 17:35:37 -0600 Subject: [PATCH 437/660] GNN: Copy over norm factors from CPU to GPU Norm factors are required during aggregation in order for the current computation on GPU to match CPU computation (earlier I was under the impression that norm factors were integrated into the data that was already copied, but this is incorrect). This commit adds the norm factor copy from CPU to GPU. --- libgnn/include/galois/graphs/GNNGraph.cuh | 4 ++++ libgnn/src/graphs/GNNGraph.cpp | 1 + libgnn/src/graphs/GNNGraph.cu | 9 +++++++++ 3 files changed, 14 insertions(+) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index d485808972..81bf00971a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -23,6 +23,8 @@ public: //! Copy over masks for the 3 sets to GPU void SetMasks(const std::vector& train, const std::vector& val, const std::vector& test); + //! 
Copy over norm factors + void SetNormFactors(const std::vector norm_factors); GNNFeature* feature_vector() const { return feature_vector_; }; int* edge_index() const { return edge_index_; } @@ -58,6 +60,8 @@ private: char* local_training_mask_{nullptr}; char* local_validation_mask_{nullptr}; char* local_testing_mask_{nullptr}; + //! Norm factors used during aggregation + GNNFloat* norm_factors_; }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index cbdf5e13db..059759a81e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -419,5 +419,6 @@ void galois::graphs::GNNGraph::InitGPUMemory() { gpu_memory_.SetLabels(local_ground_truth_labels_); gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, local_testing_mask_); + gpu_memory_.SetNormFactors(norm_factors_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index b0d5c1eb43..96ba37db15 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -82,3 +82,12 @@ void galois::graphs::GNNGraphGPUAllocations::SetMasks( CUDA_CHECK(cudaMemcpy(local_testing_mask_, test.data(), test.size() * sizeof(char), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::SetNormFactors( + const std::vector norm_factors) { + CUDA_CHECK(cudaMalloc((void**)(&norm_factors_), + norm_factors.size() * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemcpy(norm_factors_, norm_factors.data(), + norm_factors.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} From 0cdaaf5146f0c0bcc447558248f697aa2bf41af6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 18:08:22 -0600 Subject: [PATCH 438/660] GPU GCN aggregation uses norm factors Aggregation in the GPU for GCN now uses norm factors to normalize the aggregations of neighbors. This change allows it to exactly match computation done on a CPU if dropout is turned off. The next step is to add dropout support to the GPU. 
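Concretely, the kernel below scales every neighbor contribution by the product
of the source and destination norm factors. As a sketch (the factors
themselves are whatever the CPU-side graph setup computed; for GCN they are
typically the inverse square roots of the node degrees):

  agg(v) = \sum_{u \in N(v)} n_v \, n_u \, h_u, \qquad n_v \approx \frac{1}{\sqrt{\deg(v)}}

which is the standard symmetric GCN normalization.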
--- libgnn/include/galois/graphs/GNNGraph.cuh | 3 +- .../galois/layers/GraphConvolutionalLayer.cuh | 2 +- libgnn/src/GraphNeuralNetwork.cu | 1 + libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 +- libgnn/src/layers/GraphConvolutionalLayer.cu | 35 ++++++++++++++----- libgnn/test/gpu-epoch-test.cpp | 2 +- 6 files changed, 32 insertions(+), 14 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 81bf00971a..2012dcd7c9 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -29,12 +29,11 @@ public: GNNFeature* feature_vector() const { return feature_vector_; }; int* edge_index() const { return edge_index_; } int* edge_destinations() const { return edge_destinations_; } - GNNLabel* ground_truth() const { return ground_truth_; } - char* local_training_mask() const { return local_training_mask_; } char* local_validation_mask() const { return local_validation_mask_; } char* local_testing_mask() const { return local_testing_mask_; } + GNNFloat* norm_factors() const { return norm_factors_; } private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index fd4d9d76f0..c59617828d 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -18,7 +18,7 @@ public: void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output); + GNNFloat* aggregate_output, bool use_norm); void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GraphNeuralNetwork.cu b/libgnn/src/GraphNeuralNetwork.cu index a16c4b2b69..2d04073563 100644 --- a/libgnn/src/GraphNeuralNetwork.cu +++ b/libgnn/src/GraphNeuralNetwork.cu @@ -21,6 +21,7 @@ float galois::GraphNeuralNetworkGPU::GetGlobalAccuracyGPU( } // run accuracy check kernel on GPU + // TODO finish this implementation return 0.0; } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index ef9d3cbb03..04fea5f286 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -172,7 +172,8 @@ void galois::GraphConvolutionalLayer::AggregateAll( AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); #else gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), - column_length, node_embeddings, aggregate_output); + column_length, node_embeddings, aggregate_output, + config_.do_normalization); #endif } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 7828336b28..882cb32391 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -23,6 +23,7 @@ namespace { __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, const int* edge_index, const int* edge_destination, + const galois::GNNFloat* norm_factors, const galois::GNNFloat* node_embeddings, galois::GNNFloat* aggregate_output) { const unsigned thread_id = @@ -41,6 +42,13 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, // each warp works on a source: threads in warp split the feature for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + 
galois::GNNFloat src_norm = 0.0; + galois::GNNFloat norm_to_use = 1.0; + + if (norm_factors != nullptr) { + src_norm = norm_factors[src]; + } + if (thread_lane < 2) { edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; } @@ -56,19 +64,20 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, int dst = edge_destination[offset]; unsigned base_dst_index = dst * column_length; + if (norm_factors != nullptr) { + // note that otherwise it's 1.0, so a no-op when it comes to multiply + norm_to_use = src_norm * norm_factors[dst]; + } + // NOTE: this is where warp diverges // the feature aggregation is split among thread in a warp for (int i = 0; i < column_length; i += WARP_SIZE) { if ((thread_lane + i) < column_length) { aggregate_output[base_src_index + thread_lane + i] += - node_embeddings[base_dst_index + thread_lane + i]; + node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; } } } - //__syncthreads(); - // if (thread_lane == 0) { - // printf("Agg %d %f\n", src, aggregate_output[base_src_index]); - //} } } @@ -77,12 +86,20 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, void galois::GCNGPUAllocations::AggregateAllGPU( const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output) { + GNNFloat* aggregate_output, bool use_norm) { CUDA_CHECK(cudaMemset(aggregate_output, 0, num_nodes * column_length * sizeof(GNNFloat))); - AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( - num_nodes, column_length, gpu_graph.edge_index(), - gpu_graph.edge_destinations(), node_embeddings, aggregate_output); + if (use_norm) { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), gpu_graph.norm_factors(), + node_embeddings, aggregate_output); + } else { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), nullptr, node_embeddings, + aggregate_output); + } CUDA_TEST("GPU aggregate all failure"); } diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 6223fce8e5..7778550875 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -46,7 +46,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 50; epoch++) { + for (size_t epoch = 0; epoch < 20; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); if (cpu_pred.size() != predictions.size()) { cpu_pred.resize(predictions.size()); From 982a0bc783d92cdf3c3b398c794f72b7ff968d64 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 18:59:18 -0600 Subject: [PATCH 439/660] Init function for CuRAND Efficient dropout support requires RNG on the GPU: this commit adds a function to init the CuRAND RNG so that the GPU can generate the random numbers required to choose things to drop for dropout. 
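As a rough sketch of how the generator will typically feed dropout
(illustrative only; the names below are made up and the real dropout kernels
land in later commits): one uniform draw is generated per element, and the
mask keeps an element only if its draw falls under the keep probability,
scaling kept elements by 1 / keep_probability (inverted dropout).

  #include <curand.h>

  // Fill a GPU buffer with uniform (0, 1] draws, one per input element.
  void GenerateDropoutRandoms(curandGenerator_t generator,
                              float* gpu_rng_buffer, size_t num_elements) {
    CURAND_CHECK(curandGenerateUniform(generator, gpu_rng_buffer, num_elements));
  }

  // Threshold the draws into an inverted-dropout mask on the GPU.
  __global__ void BuildDropoutMask(const float* rng, float* mask,
                                   size_t num_elements, float keep_probability) {
    CUDA_KERNEL_LOOP(i, num_elements) {
      mask[i] = (rng[i] <= keep_probability) ? (1.0f / keep_probability) : 0.0f;
    }
  }
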
--- libgnn/CMakeLists.txt | 4 ++-- libgnn/include/galois/CUDAUtil.h | 11 +++++++++++ libgnn/include/galois/GNNMath.cuh | 4 ++++ libgnn/src/GNNMath.cu | 14 ++++++++++++-- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 362fc7f773..baee47c3fb 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -47,8 +47,8 @@ if (GALOIS_ENABLE_GPU) ${CMAKE_CURRENT_SOURCE_DIR}/include ) - # link to gpu lib (which takes care of moderngpu and cub) - target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas) + # link to gpu lib (which takes care of moderngpu and cub) as well as cu libs + target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas -lcurand) # gpu -> cpu lib target_link_libraries(galois_gnn galois_gnn_gpu) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index 51be6cd102..fd51eb1362 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -6,6 +6,7 @@ //! https://github.com/BVLC/caffe/blob/master/include/caffe/util/device_alternate.hpp #include #include +#include #include "galois/Logging.h" // TODO check these too and make sure they make sense @@ -71,4 +72,14 @@ inline int CUDA_GET_BLOCKS(const int N) { } \ } while (0) +//! Wrap a CuRAND call with this to check if it threw any errors +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + GALOIS_LOG_ERROR("CuRAND error code : {}", status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #endif diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index aca14a573f..40402f325b 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -7,9 +7,13 @@ namespace galois { extern bool cublas_is_init; extern cublasHandle_t global_cublas_handle; +extern bool curand_is_init; +extern curandGenerator_t global_curand_generator; //! Initializes the cublas handle to use cublas on GPUs. void InitCuBLAS(); +//! Initializes the curand RNG +void InitCuRAND(); //! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using //! CuBLAS. diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 8f60f91d84..026ca17265 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -2,8 +2,19 @@ bool galois::cublas_is_init = false; cublasHandle_t galois::global_cublas_handle; +bool galois::curand_is_init = false; +curandGenerator_t galois::global_curand_generator; -void galois::InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } +void galois::InitCuBLAS() { + CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); + galois::cublas_is_init = true; +} + +void galois::InitCuRAND() { + CURAND_CHECK(curandCreateGenerator(&galois::global_curand_generator, + CURAND_RNG_PSEUDO_DEFAULT)); + galois::curand_is_init = true; +} void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, const cublasOperation_t trans_b, size_t input_rows, @@ -12,7 +23,6 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, GNNFloat* output) { if (!cublas_is_init) { InitCuBLAS(); - cublas_is_init = true; } size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? 
output_columns : input_columns; From 256c560e74ed35f0b2c90edb0375a06de9a95848 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 19:34:51 -0600 Subject: [PATCH 440/660] GNNLayer GPU: init dropout memory Initializes a dropout mask for every GPU layer. Can be optimized if dropout is disabled (i.e. do not allocate) for both CPU/GPUs. This will be handled later once a base implementation of everything is settled. It is a float because the float will be checked during dropout to see if it crosses some threshold for dropout. --- libgnn/include/galois/layers/GNNLayer.cuh | 3 +++ libgnn/src/layers/GNNLayer.cpp | 2 ++ libgnn/src/layers/GNNLayer.cu | 6 ++++++ 3 files changed, 11 insertions(+) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 387b1673c4..0e00515302 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -12,6 +12,8 @@ public: void InitInOutMemory(size_t forward_size, size_t backward_size); //! Initializes memory for weight and weight gradients on GPU void InitWeightMemory(size_t num_weights); + //! Initializes memory for dropout + void InitDropoutMemory(size_t dropout_size); //! Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); //! Copy GPU forward output to the provided vector (assumes vector is already @@ -42,6 +44,7 @@ private: GNNFloat* backward_output_matrix_{nullptr}; GNNFloat* layer_weights_{nullptr}; GNNFloat* layer_weight_gradients_{nullptr}; + GNNFloat* dropout_mask_{nullptr}; }; } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 5688e13c31..8044eef6cf 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -20,6 +20,8 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weight_gradients_.resize(num_weight_elements, 0); #ifdef GALOIS_ENABLE_GPU base_gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); #endif GlorotBengioInit(&layer_weights_); diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 597fba96bd..dc70817dcf 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -30,6 +30,12 @@ void galois::GNNLayerGPUAllocations::InitWeightMemory(size_t num_weights) { num_weights * sizeof(GNNFloat))); } +void galois::GNNLayerGPUAllocations::InitDropoutMemory(size_t dropout_size) { + CUDA_CHECK( + cudaMalloc((void**)(&dropout_mask_), dropout_size * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(dropout_mask_, 0, dropout_size * sizeof(GNNFloat))); +} + void galois::GNNLayerGPUAllocations::CopyToWeights( const std::vector& cpu_layer_weights) { CUDA_CHECK(cudaMemcpy(layer_weights_, cpu_layer_weights.data(), From 0fa6ea70543e14e01ac4578edcbfb4b032efc5c0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 25 Nov 2020 12:33:13 -0600 Subject: [PATCH 441/660] Added CuRAND uniform generate wrapper call Wrapper call to generate random numbers in an array on the GPU. --- libgnn/include/galois/GNNMath.cuh | 3 +++ libgnn/src/GNNMath.cu | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 40402f325b..1b262fa6a3 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -15,6 +15,9 @@ void InitCuBLAS(); //! Initializes the curand RNG void InitCuRAND(); +//! 
Initializes an array with random numbers (0.0, 1.0] +void CuRANDUniformRNG(GNNFloat* array_to_fill, size_t num_elements); + //! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using //! CuBLAS. void CBlasSGEMMGPU(const cublasOperation_t trans_a, diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 026ca17265..8771b75d5b 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -16,6 +16,15 @@ void galois::InitCuRAND() { galois::curand_is_init = true; } +void galois::CuRANDUniformRNG(GNNFloat* array_to_fill, size_t num_elements) { + // TODO how much overhead does this check have? + if (!galois::curand_is_init) { + galois::InitCuRAND(); + } + CURAND_CHECK(curandGenerateUniform(galois::global_curand_generator, + array_to_fill, num_elements)); +} + void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, const cublasOperation_t trans_b, size_t input_rows, size_t input_columns, size_t output_columns, From b52e1cbc6e9e6024002541efabceb294248fa898 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 25 Nov 2020 17:10:40 -0600 Subject: [PATCH 442/660] GNN GPU: forward phase dropout Adds code necessary for forward phase of dropout. Adds a data structure for storing chars (result of dropout rng) and renames vars accordingly (floats are converted to bools depending on threshold). Fixes the variables passed into the dropout call as well as before it was using CPU pointers: this commit changes the pointer to use the PointerWithSize objects allocated for this purpose. --- libgnn/include/galois/PerThreadRNG.h | 2 + libgnn/include/galois/layers/GNNLayer.cuh | 7 +++- libgnn/src/layers/GNNLayer.cpp | 10 ++--- libgnn/src/layers/GNNLayer.cu | 37 ++++++++++++++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 5 +-- 5 files changed, 50 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/PerThreadRNG.h b/libgnn/include/galois/PerThreadRNG.h index fde88386ab..441b1b542c 100644 --- a/libgnn/include/galois/PerThreadRNG.h +++ b/libgnn/include/galois/PerThreadRNG.h @@ -26,6 +26,8 @@ class PerThreadRNG { } //! Return true or false based on some dropout rate bool DoBernoulli(float dropout_rate) { + // TODO can the random number be 0? what is the behavior of 0 > 0? + // same with 1 > 1..... return (GetRandomNumber() > dropout_rate) ? 1 : 0; } diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 0e00515302..fed3f12402 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -29,6 +29,10 @@ public: //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); + //! Does dropout on the GPU; saves non-dropped weights to output + void DoDropoutGPU(const PointerWithSize input_to_dropout, + PointerWithSize output, float dropout_rate); + //! Helper function: give a vector which is copied over to the GPU (new //! 
memory is allocated as necessary) GNNFloat* Allocate(const std::vector& v); @@ -44,7 +48,8 @@ private: GNNFloat* backward_output_matrix_{nullptr}; GNNFloat* layer_weights_{nullptr}; GNNFloat* layer_weight_gradients_{nullptr}; - GNNFloat* dropout_mask_{nullptr}; + GNNFloat* rng_results_{nullptr}; + char* dropout_mask_{nullptr}; }; } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 8044eef6cf..3541442bed 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -128,12 +128,12 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { - //#ifdef GALOIS_ENABLE_GPU - // // XXX - // DoDropoutGPU(); - //#else +#ifdef GALOIS_ENABLE_GPU + base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, + config_.dropout_rate); +#else DoDropoutCPU(input_to_dropout, output_matrix); - //#endif +#endif } void galois::GNNLayer::DoDropoutDerivative() { diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index dc70817dcf..0e43a478be 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -1,4 +1,5 @@ #include "galois/CUDAUtil.h" +#include "galois/GNNMath.cuh" #include "galois/layers/GNNLayer.cuh" galois::GNNLayerGPUAllocations::~GNNLayerGPUAllocations() { @@ -32,8 +33,11 @@ void galois::GNNLayerGPUAllocations::InitWeightMemory(size_t num_weights) { void galois::GNNLayerGPUAllocations::InitDropoutMemory(size_t dropout_size) { CUDA_CHECK( - cudaMalloc((void**)(&dropout_mask_), dropout_size * sizeof(GNNFloat))); - CUDA_CHECK(cudaMemset(dropout_mask_, 0, dropout_size * sizeof(GNNFloat))); + cudaMalloc((void**)(&rng_results_), dropout_size * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(rng_results_, 0, dropout_size * sizeof(GNNFloat))); + + CUDA_CHECK(cudaMalloc((void**)(&dropout_mask_), dropout_size * sizeof(char))); + CUDA_CHECK(cudaMemset(dropout_mask_, 0, dropout_size * sizeof(char))); } void galois::GNNLayerGPUAllocations::CopyToWeights( @@ -64,6 +68,35 @@ void galois::GNNLayerGPUAllocations::CopyWeightGradientsToCPU( cudaMemcpyDeviceToHost)); } +namespace { + +__global__ void +DoDropoutImpl(size_t input_size, const galois::GNNFloat* input_to_dropout, + galois::GNNFloat* output, const galois::GNNFloat* rng_vector, + char* dropout_mask, float dropout_rate, galois::GNNFloat scale) { + CUDA_KERNEL_LOOP(i, input_size) { + // convert the rng floats into a mask + dropout_mask[i] = rng_vector[i] > dropout_rate ? 1 : 0; + // use mask to keep/drop weights + output[i] = input_to_dropout[i] * dropout_mask[i] * scale; + } +} + +} // namespace + +void galois::GNNLayerGPUAllocations::DoDropoutGPU( + const PointerWithSize input_to_dropout, + PointerWithSize output, float dropout_rate) { + // RNG which weights to dropout + galois::CuRANDUniformRNG(rng_results_, input_to_dropout.size()); + GNNFloat scale = 1. / (1. 
- dropout_rate); + // GPU dropout kernel + DoDropoutImpl<<>>( + input_to_dropout.size(), input_to_dropout.data(), output.data(), + rng_results_, dropout_mask_, dropout_rate, scale); + CUDA_TEST("Dropout on GPU failure"); +} + galois::GNNFloat* galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { // TODO keep track of these so that on destruction they can be freed diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 04fea5f286..d070afa1ef 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -53,9 +53,8 @@ galois::GraphConvolutionalLayer::ForwardPhase( const GNNFloat* input_data = input_embeddings.data(); // first, dropout if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { - galois::PointerWithSize drop_output(in_temp_1_); - DoDropout(input_embeddings, &drop_output); - input_data = drop_output.data(); + DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); } // flip aggregate/update if dimensions favor it (do less work) From a186e5836ef35c0fee1964859f95688afde3a463 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 25 Nov 2020 17:48:43 -0600 Subject: [PATCH 443/660] GPU GNN: backward phase dropout derivative Adds the GPU wrapper + kernel call for the derivative of dropout for the backward phase of the GCN layer. Makes the epoch tests for both CPUs and GPUs equivalent as well. With this commit, single CPU/GPU functionality for the GCN is roughly the same. Note that answers will not be the same due to dropout RNG being different on the 2 platforms. --- libgnn/include/galois/layers/GNNLayer.cuh | 2 ++ libgnn/src/layers/GNNLayer.cpp | 11 ++++++++--- libgnn/src/layers/GNNLayer.cu | 18 +++++++++++++++++- libgnn/test/epoch-test.cpp | 2 +- libgnn/test/gpu-epoch-test.cpp | 6 +++--- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index fed3f12402..9dfd09e0da 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -32,6 +32,8 @@ public: //! Does dropout on the GPU; saves non-dropped weights to output void DoDropoutGPU(const PointerWithSize input_to_dropout, PointerWithSize output, float dropout_rate); + //! Does dropout derivative on the backward output matrix of the gpu + void DoDropoutDerivativeGPU(size_t input_size, GNNFloat scale); //! Helper function: give a vector which is copied over to the GPU (new //! memory is allocated as necessary) diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 3541442bed..aff4bc3b11 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -128,11 +128,11 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { -#ifdef GALOIS_ENABLE_GPU +#ifndef GALOIS_ENABLE_GPU + DoDropoutCPU(input_to_dropout, output_matrix); +#else base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, config_.dropout_rate); -#else - DoDropoutCPU(input_to_dropout, output_matrix); #endif } @@ -140,6 +140,7 @@ void galois::GNNLayer::DoDropoutDerivative() { assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. 
- config_.dropout_rate); +#ifndef GALOIS_ENABLE_GPU // use dropout mask to figure out derivative galois::do_all( galois::iterate(static_cast(0), backward_output_matrix_.size()), @@ -149,6 +150,10 @@ void galois::GNNLayer::DoDropoutDerivative() { scale; }, galois::loopname("LayerDropoutDerivative")); +#else + base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), + scale); +#endif } void galois::GNNLayer::Activation() { diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 0e43a478be..d6616be5fe 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -78,7 +78,16 @@ DoDropoutImpl(size_t input_size, const galois::GNNFloat* input_to_dropout, // convert the rng floats into a mask dropout_mask[i] = rng_vector[i] > dropout_rate ? 1 : 0; // use mask to keep/drop weights - output[i] = input_to_dropout[i] * dropout_mask[i] * scale; + output[i] = input_to_dropout[i] * (float)dropout_mask[i] * scale; + } +} + +__global__ void DoDropoutDerivativeImpl(size_t input_size, + galois::GNNFloat* input, + char* dropout_mask, + galois::GNNFloat scale) { + CUDA_KERNEL_LOOP(i, input_size) { + input[i] = input[i] * (float)dropout_mask[i] * scale; } } @@ -97,6 +106,13 @@ void galois::GNNLayerGPUAllocations::DoDropoutGPU( CUDA_TEST("Dropout on GPU failure"); } +void galois::GNNLayerGPUAllocations::DoDropoutDerivativeGPU(size_t input_size, + GNNFloat scale) { + DoDropoutDerivativeImpl<<>>( + input_size, backward_output_matrix_, dropout_mask_, scale); + CUDA_TEST("Dropout derivative on GPU failure"); +} + galois::GNNFloat* galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { // TODO keep track of these so that on destruction they can be freed diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index da2a9e1be2..d8a27cc13b 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -43,7 +43,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 20; epoch++) { + for (size_t epoch = 0; epoch < 100; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 7778550875..3a481b9d66 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -14,7 +14,7 @@ int main() { // load graph auto test_graph = std::make_unique( - "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); + "cora", galois::graphs::GNNPartitionScheme::kCVC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -22,7 +22,7 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = false; + layer_config.do_dropout = true; layer_config.do_activation = false; layer_config.do_normalization = true; // XXX Activation kills accuracy compared to old code, esp. 
for cora @@ -46,7 +46,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 20; epoch++) { + for (size_t epoch = 0; epoch < 100; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); if (cpu_pred.size() != predictions.size()) { cpu_pred.resize(predictions.size()); From 461d57571f1159d8add0005fe664878a089866e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 15 Dec 2020 19:57:30 -0600 Subject: [PATCH 444/660] Gradient sync to sum, master only softmax Changes the gradient sync function to use sum instead of average, and make it so the softmax layer only loops over the master nodes on each host. The effects of this are that accuracy in a distributed setting will be exactly the same as accuracy in a single host setting because no redundant computation will occur. In practice, however, RNG on each host (i.e., dropout) will cause distributed execution to differ from single host execution. Turning off all RNG will make it so the exact same computation occurs (tradeoff is that dropout isn't done, so overfitting to the train set may occur). --- libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 +++- libgnn/src/layers/SoftmaxLayer.cpp | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index d070afa1ef..9c4379dbcc 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -89,6 +89,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { assert(layer_phase_ == GNNPhase::kTrain); + // derivative of activation if (config_.do_activation) { ActivationDerivative(input_gradient); @@ -153,7 +154,8 @@ galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already // TODO figure out how to do this with GPUs - WeightGradientSyncAverage(); + // WeightGradientSyncAverage(); + WeightGradientSyncSum(); if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index a4d5133caa..562349780b 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -10,7 +10,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax @@ -60,7 +60,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { backward_output_matrix_.assign(backward_output_matrix_.size(), 0); galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { // create ground truth vector for this LID From 98509ebd21d949e717fd98901cc324ad39ba55aa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 18 Dec 2020 19:12:02 -0600 Subject: [PATCH 445/660] Old code fixing: zero out output matrices Old GNN code was problematic as it did not zero out output matrices, meaning garbage was introduced into training step (and ironically improved accuracy). This has been fixed. Also left a comment in adam optimizer noting its incorrect use. 
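The failure mode is the usual one for reused accumulation buffers: if the output matrix still holds values from the previous training step, the accumulating writes pile new contributions on top of stale ones. A small standalone sketch of the pattern being fixed (illustrative only, not the library code):

#include <algorithm>
#include <cstddef>
#include <vector>

// A buffer reused across training steps must be cleared before each
// accumulation pass; otherwise values from the previous step leak
// into the current one.
void AccumulateInto(const std::vector<float>& contributions,
                    std::vector<float>* out) {
  // the added zeroing step
  std::fill(out->begin(), out->end(), 0.0f);
  for (size_t i = 0; i < out->size() && i < contributions.size(); ++i) {
    (*out)[i] += contributions[i];
  }
}
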
--- libdeepgalois/include/deepgalois/optimizer.h | 2 +- libdeepgalois/src/layers/softmax_loss_layer.cpp | 14 ++++++++++++++ libdeepgalois/src/optimizer.cpp | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index ceb0f93ba0..f5eb4b54ec 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -49,7 +49,7 @@ struct stateful_optimizer : public optimizer { vec_t& get(const vec_t& key) { static_assert(Index < N, "index out of range"); if (E_[Index][&key].empty()) - E_[Index][&key].resize(key.size(), float_t()); + E_[Index][&key].resize(key.size(), float_t(0)); return E_[Index][&key]; } std::unordered_map E_[N]; diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 3581365427..17e7023176 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -29,6 +29,13 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // size_t numSamples = input_dims; size_t featLen = input_dims[1]; + // zero out the output vector + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + out_data[i * featLen + j] = 0.0; + } + } + galois::do_all( galois::iterate(begin_, end_), [&](const unsigned gid) { @@ -61,6 +68,13 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t featLen = layer::input_dims[1]; + + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + in_grad[i * featLen + j] = 0.0; + } + } + galois::do_all( galois::iterate(layer::begin_, layer::end_), [&](const auto& gid) { diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index e8455e9206..4538d1c956 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -46,6 +46,8 @@ void adam::update(const vec_t& dW, vec_t& W) { }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); + // TODO/NOTE: this is incorrect: adam parameters should not be shared + // among layers, but this is making it shared b1_t *= b1; b2_t *= b2; } From f20e47368148e3a7bf1572b1039ee7a9d2045ef3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 Jan 2021 18:45:04 -0600 Subject: [PATCH 446/660] Testing splits for ogbn datasets Graph loading in libgnn requires splits for the training set; this commit adds them for the new ogbn datasets. 
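The values pushed into bps below are begin/end node IDs for the training split. A rough sketch of how such a pair could be turned into a per-node training flag; the helper name and the half-open-range assumption are mine, not the CuSP code:

#include <cstdint>
#include <vector>

// Hypothetical helper: mark every node whose global ID falls in the
// assumed half-open range [begin, end) as part of the training split.
std::vector<char> BuildTrainingMask(uint64_t num_nodes, uint64_t begin,
                                    uint64_t end) {
  std::vector<char> mask(num_nodes, 0);
  for (uint64_t id = begin; id < end && id < num_nodes; ++id) {
    mask[id] = 1;
  }
  return mask;
}
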
--- libcusp/include/galois/graphs/NewGeneric.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 048cfa4bc2..9d6fe7b558 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -100,6 +100,12 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("tester") != std::string::npos) { bps.push_back(0); bps.push_back(5); + } else if (filename.find("ogbn-arxiv") != std::string::npos) { + bps.push_back(0); + bps.push_back(169251); + } else if (filename.find("ogbn-products") != std::string::npos) { + bps.push_back(0); + bps.push_back(196614); } else { // XXX only die under certain conditions // GALOIS_DIE("invalid input for gnn partitioning ", filename, From 4e58928c3648aab90692070e57afc01f626cc495 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 Jan 2021 19:07:56 -0600 Subject: [PATCH 447/660] Off by one for ogbn training splits --- libcusp/include/galois/graphs/NewGeneric.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 9d6fe7b558..e8f4fb332d 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -102,10 +102,10 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(5); } else if (filename.find("ogbn-arxiv") != std::string::npos) { bps.push_back(0); - bps.push_back(169251); + bps.push_back(169252); } else if (filename.find("ogbn-products") != std::string::npos) { bps.push_back(0); - bps.push_back(196614); + bps.push_back(196615); } else { // XXX only die under certain conditions // GALOIS_DIE("invalid input for gnn partitioning ", filename, From 2ae60a61c6b29e505fde6087a401f1b349e36dae Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 Jan 2021 19:05:47 -0600 Subject: [PATCH 448/660] multilabel reading test Adds a multilabel reading test as well as a function to GNNGraph to grab multi-class labels (returns a pointer). Also makes a reading change: for multi-class files the labels files should be "mlabels" rather than just labels. --- libgnn/include/galois/graphs/GNNGraph.h | 8 ++ libgnn/src/graphs/GNNGraph.cpp | 8 +- libgnn/test/CMakeLists.txt | 3 + libgnn/test/multilabel-read.cpp | 142 ++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 libgnn/test/multilabel-read.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 04debc019f..c06de18182 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -100,6 +100,14 @@ class GNNGraph { return local_ground_truth_labels_[lid]; } + //! Returns pointer to start of ground truth vector for some local id assuming + //! labels are multi-class. + const GNNLabel* GetMultiClassLabel(const unsigned lid) const { + assert(!using_single_class_labels_); + return static_cast(local_ground_truth_labels_.data() + + (lid * num_label_classes_)); + } + //! 
Return matrix of the local node features const PointerWithSize GetLocalFeatures() { #ifndef GALOIS_ENABLE_GPU diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 059759a81e..df73a1cd61 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -132,7 +132,13 @@ void galois::graphs::GNNGraph::AggregateSync( void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - std::string filename = input_directory_ + dataset_name + "-labels.txt"; + std::string filename; + if (has_single_class_label) { + filename = input_directory_ + dataset_name + "-labels.txt"; + } else { + filename = input_directory_ + dataset_name + "-mlabels.txt"; + } + // read file header, save num label classes while at it std::ifstream file_stream; file_stream.open(filename, std::ios::in); diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index c900c7318c..8385f0b177 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -38,6 +38,9 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(weight-sync-test weight-sync-test.cpp) target_link_libraries(weight-sync-test galois_gnn) + + add_executable(multilabel-read multilabel-read.cpp) + target_link_libraries(multilabel-read galois_gnn) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/multilabel-read.cpp b/libgnn/test/multilabel-read.cpp new file mode 100644 index 0000000000..83debfa2bc --- /dev/null +++ b/libgnn/test/multilabel-read.cpp @@ -0,0 +1,142 @@ +//! @file multilabel-read +//! Make sure multilabels read are sane + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + // load test graph; false at end = multilabel + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + const galois::GNNLabel* labels = test_graph.GetMultiClassLabel(0); + + unsigned i = 0; + GALOIS_LOG_ASSERT(1 == labels[i * 7]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 1; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 2; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 3; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 4; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 
7 + 3]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 6]); + + i = 5; + GALOIS_LOG_ASSERT(1 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 6]); + + i = 6; + GALOIS_LOG_ASSERT(1 == labels[i * 7]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 6]); + + labels = test_graph.GetMultiClassLabel(0); + GALOIS_LOG_ASSERT(1 == labels[0]); + GALOIS_LOG_ASSERT(1 == labels[1]); + GALOIS_LOG_ASSERT(1 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(1); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(1 == labels[1]); + GALOIS_LOG_ASSERT(1 == labels[2]); + GALOIS_LOG_ASSERT(1 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(2); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(1 == labels[2]); + GALOIS_LOG_ASSERT(1 == labels[3]); + GALOIS_LOG_ASSERT(1 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(3); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(1 == labels[3]); + GALOIS_LOG_ASSERT(1 == labels[4]); + GALOIS_LOG_ASSERT(1 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(4); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(1 == labels[4]); + GALOIS_LOG_ASSERT(1 == labels[5]); + GALOIS_LOG_ASSERT(1 == labels[6]); + + labels = test_graph.GetMultiClassLabel(5); + GALOIS_LOG_ASSERT(1 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(1 == labels[5]); + GALOIS_LOG_ASSERT(1 == labels[6]); + + labels = test_graph.GetMultiClassLabel(6); + GALOIS_LOG_ASSERT(1 == labels[0]); + GALOIS_LOG_ASSERT(1 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(1 == labels[6]); + + return 0; +} From 3398c5f4dd57c0048f0370dd6b649c68b9eeb7d9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 Jan 2021 15:41:16 -0600 Subject: [PATCH 449/660] F1 scoring for multi-class labels 1) Moved accuracy functions from GNN to GNNGraph which contains the labels necessary to figure out accuracy. 2) Fixed an issue with an older test w.r.t. multi-class reading. 3) Added a new F1 scoring test. 4) New F1 scoring function added (MultiClass accuracy) which returns micro F1 score. 
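Micro F1 pools true/false positives and negatives across all label classes before computing a single precision/recall pair; a prediction counts as positive for a class when its output crosses the 0.5 threshold used in the per-class loop below. A small self-contained sketch of the final calculation from pooled counts (not the GNNGraph member function itself):

#include <cstddef>

// Micro F1: aggregate counts over every class first, then take the
// harmonic mean of the resulting precision and recall.
double MicroF1(size_t true_pos, size_t false_pos, size_t false_neg) {
  double precision =
      (true_pos + false_pos) > 0
          ? static_cast<double>(true_pos) / (true_pos + false_pos)
          : 0.0;
  double recall =
      (true_pos + false_neg) > 0
          ? static_cast<double>(true_pos) / (true_pos + false_neg)
          : 0.0;
  return (precision + recall) > 0
             ? 2.0 * precision * recall / (precision + recall)
             : 0.0;
}
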
--- libgnn/include/galois/GraphNeuralNetwork.h | 9 +- libgnn/include/galois/graphs/GNNGraph.h | 23 ++- libgnn/src/GraphNeuralNetwork.cpp | 56 +------- libgnn/src/graphs/GNNGraph.cpp | 155 +++++++++++++++++++++ libgnn/test/CMakeLists.txt | 5 + libgnn/test/f1-test.cpp | 51 +++++++ libgnn/test/gnngraph-test.cpp | 10 +- 7 files changed, 241 insertions(+), 68 deletions(-) create mode 100644 libgnn/test/f1-test.cpp diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 652b1cbfad..d4681746f3 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -146,10 +146,10 @@ class GraphNeuralNetwork { //! @returns Output layer's output const PointerWithSize DoInference(); + //! Returns classification accuracy for single class label or micro F1 score + //! for multi-class predictions; this calls into GNNGraph's accuracy call float GetGlobalAccuracy(const PointerWithSize predictions); - float GetGlobalAccuracyCPU(const PointerWithSize predictions); - //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! most literature @@ -166,10 +166,7 @@ class GraphNeuralNetwork { std::vector> gnn_layers_; //! Current phase of the GNN: train, validation, test GNNPhase phase_{GNNPhase::kTrain}; - //! Used to track accurate predictions during accuracy calculation - DGAccumulator num_correct_; - //! Used to count total number of things checked during accuracy calculation - DGAccumulator total_checked_; + #ifdef GALOIS_ENABLE_GPU //! Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c06de18182..cfed56aade 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -55,7 +55,9 @@ class GNNGraph { size_t node_feature_length() const { return node_feature_length_; } //! Return the number of label classes (i.e. number of possible outputs) - size_t GetNumLabelClasses() const { return num_label_classes_; }; + size_t GetNumLabelClasses() const { return num_label_classes_; } + + bool is_single_class_label() const { return using_single_class_labels_; } ////////////////////////////////////////////////////////////////////////////// // Graph accessors @@ -93,6 +95,9 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + float GetGlobalAccuracy(PointerWithSize predictions, + GNNPhase phase); + //! Returns the ground truth label of some local id assuming labels are single //! class labels. GNNFloat GetSingleClassLabel(const unsigned lid) const { @@ -139,6 +144,13 @@ class GNNGraph { const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } #endif private: + float GetGlobalAccuracyCPU(PointerWithSize predictions, + GNNPhase phase); + float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, + GNNPhase phase); + float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, + GNNPhase phase); + //! Directory for input data const std::string input_directory_; //! In a multi-host setting, this variable stores the host id that the graph @@ -222,6 +234,15 @@ class GNNGraph { //! memory and copies things over void InitGPUMemory(); #endif + //! Used to track accurate predictions during accuracy calculation + DGAccumulator num_correct_; + //! 
Used to count total number of things checked during accuracy calculation + DGAccumulator total_checked_; + // Below are used for multi-class accuracy + DGAccumulator local_true_positive_; + DGAccumulator local_true_negative_; + DGAccumulator local_false_positive_; + DGAccumulator local_false_negative_; }; } // namespace graphs diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ebe486b47a..afd46f1bcb 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -67,8 +67,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GradientPropagation(); float train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { - galois::gPrint("Epoch ", epoch, ": Train accuracy is ", train_accuracy, - "\n"); + galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", + train_accuracy, "\n"); } // TODO validation and test as necessary } @@ -100,56 +100,8 @@ galois::GraphNeuralNetwork::DoInference() { } float galois::GraphNeuralNetwork::GetGlobalAccuracy( - const PointerWithSize predictions) { - // TODO mark as a forwarding argument? - //#ifndef GALOIS_ENABLE_GPU - return GetGlobalAccuracyCPU(predictions); - //#else - // return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, - // predictions); - //#endif -} - -float galois::GraphNeuralNetwork::GetGlobalAccuracyCPU( - const PointerWithSize predictions) { - // check owned nodes' accuracy - size_t num_labels = graph_->GetNumLabelClasses(); - assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); - num_correct_.reset(); - total_checked_.reset(); - - galois::do_all( - galois::iterate(graph_->begin_owned(), graph_->end_owned()), - [&](const unsigned lid) { - if (graph_->IsValidForPhase(lid, phase_)) { - total_checked_ += 1; - // get prediction by getting max - size_t predicted_label = - galois::MaxIndex(num_labels, &(predictions[lid * num_labels])); - // GALOIS_LOG_VERBOSE("Checking LID {} with label {} against - // prediction {}", - // lid, graph_->GetSingleClassLabel(lid), - // predicted_label); - // check against ground truth and track accordingly - // TODO static cast used here is dangerous - if (predicted_label == - static_cast(graph_->GetSingleClassLabel(lid))) { - num_correct_ += 1; - } - } - }, - // TODO chunk size? 
- // steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("GlobalAccuracy")); - // TODO revise for later when multi-class labels come in - - size_t global_correct = num_correct_.reduce(); - size_t global_checked = total_checked_.reduce(); - - GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); - - return static_cast(global_correct) / - static_cast(global_checked); + PointerWithSize predictions) { + return graph_->GetGlobalAccuracy(predictions, phase_); } void galois::GraphNeuralNetwork::GradientPropagation() { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index df73a1cd61..2753e07f3d 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -2,6 +2,7 @@ #include "galois/Logging.h" #include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" +#include "galois/GNNMath.h" #include namespace { @@ -378,6 +379,160 @@ void galois::graphs::GNNGraph::InitNormFactor() { galois::loopname("InitNormFactor")); } +float galois::graphs::GNNGraph::GetGlobalAccuracy( + PointerWithSize predictions, GNNPhase phase) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracyCPU(predictions, phase); +} + +float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( + PointerWithSize predictions, GNNPhase phase) { + if (is_single_class_label()) { + return GetGlobalAccuracyCPUSingle(predictions, phase); + } else { + return GetGlobalAccuracyCPUMulti(predictions, phase); + } +} + +float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( + PointerWithSize predictions, GNNPhase phase) { + // check owned nodes' accuracy + assert((num_label_classes_ * size()) == predictions.size()); + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const unsigned lid) { + if (IsValidForPhase(lid, phase)) { + total_checked_ += 1; + // get prediction by getting max + size_t predicted_label = galois::MaxIndex( + num_label_classes_, &(predictions[lid * num_label_classes_])); + // check against ground truth and track accordingly + // TODO static cast used here is dangerous + if (predicted_label == + static_cast(GetSingleClassLabel(lid))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); + + return static_cast(global_correct) / + static_cast(global_checked); +} + +float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( + PointerWithSize predictions, GNNPhase phase) { + + const GNNLabel* full_ground_truth = GetMultiClassLabel(0); + assert(predictions.size() == (num_label_classes_ * size())); + + size_t global_true_positive = 0; + size_t global_true_negative = 0; + size_t global_false_positive = 0; + size_t global_false_negative = 0; + size_t global_f1_score = 0; + + // per class check + for (size_t label_class = 0; label_class < num_label_classes_; + label_class++) { + local_true_positive_.reset(); + local_true_negative_.reset(); + local_false_positive_.reset(); + local_false_negative_.reset(); + + // loop through all *owned* nodes (do not want to overcount) + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const unsigned lid) { + if (IsValidForPhase(lid, phase)) { + size_t label_index = lid * num_label_classes_ + label_class; + 
GNNLabel true_label = full_ground_truth[label_index]; + GNNLabel prediction_is_positive = + (predictions[label_index] > 0.5) ? 1 : 0; + + if (true_label && prediction_is_positive) { + local_true_positive_ += 1; + } else if (true_label && !prediction_is_positive) { + local_false_negative_ += 1; + } else if (!true_label && prediction_is_positive) { + local_false_positive_ += 1; + } else if (!true_label && !prediction_is_positive) { + local_true_negative_ += 1; + } else { + // all cases should be covered with clauses above, so it should + // NEVER get here; adding it here just for sanity purposes + GALOIS_LOG_FATAL( + "Logic error with true label and prediction label"); + } + } + total_checked_ += 1; + }, + galois::steal(), galois::loopname("GlobalMultiAccuracy")); + + // reduce from accumulators across all hosts for this particular class + size_t class_true_positives = local_true_positive_.reduce(); + size_t class_false_positives = local_false_positive_.reduce(); + size_t class_true_negatives = local_true_negative_.reduce(); + size_t class_false_negatives = local_false_negative_.reduce(); + + // add to global counts + global_true_positive += class_true_positives; + global_false_positive += class_false_positives; + global_true_negative += class_true_negatives; + global_false_negative += class_false_negatives; + + // calculate precision, recall, and f1 score for this class + // ternery op used to avoid division by 0 + double class_precision = + (class_true_positives + class_true_negatives) > 0 + ? static_cast(class_true_positives) / + (class_true_positives + class_false_positives) + : 0.0; + double class_recall = + (class_true_positives + class_false_negatives) > 0 + ? static_cast(class_true_positives) / + (class_true_positives + class_false_negatives) + : 0.0; + double class_f1_score = (class_precision + class_recall) > 0 + ? (2.0 * (class_precision * class_recall)) / + (class_precision + class_recall) + : 0.0; + + global_f1_score += class_f1_score; + } // end label class loop + + // double global_f1_macro_score = global_f1_score / num_label_classes_; + + // micro = considers all classes for precision/recall + double global_micro_precision = + (global_true_positive + global_true_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_positive) + : 0.0; + double global_micro_recall = + (global_true_positive + global_false_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_negative) + : 0.0; + + double global_f1_micro_score = + (global_micro_precision + global_micro_recall) > 0 + ? 
(2.0 * (global_micro_precision * global_micro_recall)) / + (global_micro_precision + global_micro_recall) + : 0.0; + + return global_f1_micro_score; +} + #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { // create int casted CSR diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 8385f0b177..820bd03019 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -41,6 +41,11 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(multilabel-read multilabel-read.cpp) target_link_libraries(multilabel-read galois_gnn) + add_test(NAME multilabel-read COMMAND multilabel-read) + + add_executable(f1-test f1-test.cpp) + target_link_libraries(f1-test galois_gnn) + add_test(NAME f1-test COMMAND f1-test) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/f1-test.cpp b/libgnn/test/f1-test.cpp new file mode 100644 index 0000000000..64935bc235 --- /dev/null +++ b/libgnn/test/f1-test.cpp @@ -0,0 +1,51 @@ +//! @file f1-test +//! Tests f1 micro accuracy for multiclass labels + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + // load test graph; false at end = multilabel + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + // perfect precision and recall + std::vector prediction = { + 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1}; + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction, galois::GNNPhase::kTrain)); + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction, galois::GNNPhase::kValidate)); + GALOIS_LOG_ASSERT( + 1.0 == test_graph.GetGlobalAccuracy(prediction, galois::GNNPhase::kTest)); + + // perfect recall, but training precision is bad + std::vector prediction2 = { + 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1}; + + // just print here and check with eyes: checking float equivalance is a pain + // both prints should be .6666666 + GALOIS_LOG_DEBUG( + "{} {}", + test_graph.GetGlobalAccuracy(prediction2, galois::GNNPhase::kTrain), + (2 * (15.0 / 30.0)) / ((15.0 / 30.0) + 1)); + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction2, galois::GNNPhase::kValidate)); + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction2, galois::GNNPhase::kTest)); + + // no predictions made + std::vector prediction3(49, 0); + GALOIS_LOG_ASSERT(0.0 == test_graph.GetGlobalAccuracy( + prediction3, galois::GNNPhase::kTrain)); + GALOIS_LOG_ASSERT(0.0 == test_graph.GetGlobalAccuracy( + prediction3, galois::GNNPhase::kValidate)); + GALOIS_LOG_ASSERT(0.0 == test_graph.GetGlobalAccuracy( + prediction3, galois::GNNPhase::kTest)); + + return 0; +} diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 7db24081f5..5aa4d72ddf 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -14,21 +14,13 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); - GALOIS_LOG_VERBOSE("reddit with multilabel, oec"); - galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC, - false); + // multi level reading tested in another test GALOIS_LOG_VERBOSE("reddit with single label, oec"); galois::graphs::GNNGraph("reddit", 
galois::graphs::GNNPartitionScheme::kOEC, true); - GALOIS_LOG_VERBOSE("reddit with multilabel, cvc"); - galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, - false); GALOIS_LOG_VERBOSE("reddit with single label, cvc"); galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, true); - // TODO fix citeseer and goec - // galois::graphs::GNNGraph("citeseer", - // galois::graphs::GNNPartitionScheme::kOEC, false); return 0; } From 4025be2c2e442afb1c09441a3e8369b64078a085 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 12 Jan 2021 13:22:26 -0600 Subject: [PATCH 450/660] GNN sigmoid output layer - Adds the sigmoid layer as a possible output layer for use. - Templatizes cross entropy call because truth vector doens't necessarily need to be a float. - Removed definition of constructors from GNN class and let default constructors do their thing. - Added a sanity test for multi-label epochs. - Added yelp/amazon ranges to CuSP Needs to be added as an option to the GCN app (next commit). --- libcusp/include/galois/graphs/NewGeneric.h | 13 +++- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNMath.h | 11 ++- libgnn/include/galois/GraphNeuralNetwork.h | 7 -- libgnn/include/galois/layers/GNNLayer.h | 3 +- libgnn/include/galois/layers/SigmoidLayer.h | 52 +++++++++++++ libgnn/src/GNNMath.cpp | 9 --- libgnn/src/GraphNeuralNetwork.cpp | 14 ++++ libgnn/src/layers/SigmoidLayer.cpp | 86 +++++++++++++++++++++ libgnn/test/CMakeLists.txt | 8 +- libgnn/test/multilabel-epoch-test.cpp | 59 ++++++++++++++ 11 files changed, 239 insertions(+), 24 deletions(-) create mode 100644 libgnn/include/galois/layers/SigmoidLayer.h create mode 100644 libgnn/src/layers/SigmoidLayer.cpp create mode 100644 libgnn/test/multilabel-epoch-test.cpp diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index e8f4fb332d..0c3e4b31d4 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -81,6 +81,7 @@ class NewDistGraphGeneric : public DistGraph { // performance critical std::vector bps; + // TODO(loc) avoid this entirely and load it from file... 
// if through all possible GNN outputs if (filename.find("cora") != std::string::npos) { bps.push_back(0); @@ -106,12 +107,20 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("ogbn-products") != std::string::npos) { bps.push_back(0); bps.push_back(196615); + } else if (filename.find("yelp") != std::string::npos) { + // this is entire graph: yelp's mask isn't contiguous + bps.push_back(0); + bps.push_back(716847); + } else if (filename.find("amazon") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(1569960); } else { - // XXX only die under certain conditions + // TODO(loc) only die under certain conditions; don't die if something + // is missing // GALOIS_DIE("invalid input for gnn partitioning ", filename, // " hardcode needed"); } - // TODO hardcode the rest return bps; } diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index baee47c3fb..320189c44e 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -6,6 +6,7 @@ set(sources src/layers/GNNLayer.cpp src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp + src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp ) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 488b538d75..f8edd9650f 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -28,10 +28,17 @@ void GNNSoftmaxDerivative(const size_t vector_length, galois::GNNFloat GNNCrossEntropy(const size_t vector_length, const GNNFloat* ground_truth, const GNNFloat* input); + //! Derivative of cross entropy; gradients saved into an output vector. +template void GNNCrossEntropyDerivative(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input, GNNFloat* gradients); + const TruthType* ground_truth, + const GNNFloat* input, GNNFloat* gradients) { + for (size_t i = 0; i < vector_length; i++) { + gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); + } +} + //! Calls into a library BLAS call to do matrix muliply; uses default alpha/beta void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index d4681746f3..51142b9b38 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -22,13 +22,6 @@ namespace galois { //! determine how the network gets constructed. class GraphNeuralNetworkConfig { public: - // default move, no copy - GraphNeuralNetworkConfig() = delete; - GraphNeuralNetworkConfig(const GraphNeuralNetworkConfig&) = delete; - GraphNeuralNetworkConfig& operator=(const GraphNeuralNetworkConfig&) = delete; - GraphNeuralNetworkConfig(GraphNeuralNetworkConfig&&) = default; - GraphNeuralNetworkConfig& operator=(GraphNeuralNetworkConfig&&) = default; - //! Construction without a config for layers specified; uses a default GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 3296b17d20..93498a6497 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -20,9 +20,8 @@ enum class GNNLayerType { // TODO SAGE and GAT }; -// TODO Sigmoid //! 
Supported output layer types in the GNN -enum class GNNOutputLayerType { kInvalid, kSoftmax }; +enum class GNNOutputLayerType { kInvalid, kSoftmax, kSigmoid }; //! Struct holding the dimensions of a layer. Assumption is that a layer takes //! a matrix and outputs another matrix with a different # of columns (e.g. diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h new file mode 100644 index 0000000000..44c215909d --- /dev/null +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -0,0 +1,52 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +// TODO(loc) GPU support + +namespace galois { + +//! Sigmoid layer: applies sigmoid function element wise to each element of the +//! input. +//! Meant for use with *multi-class* labels. +class SigmoidLayer : public GNNLayer { +public: + SigmoidLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GNNLayer(layer_num, graph, dimensions, + GNNLayerConfig{.allocate_weights = false}), + // input_loss_(dimensions.input_rows), + norm_gradient_vectors_(dimensions.input_columns) { + output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; + // input/output columns must be equivalent + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + // output needs to match number of possible classes + GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); + } + + //! Normalizes all elements by applying sigmoid to all of them + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + //! Get gradients to fix distribution such that it leans more towards + //! multiclass ground truth. + PointerWithSize + BackwardPhase(const PointerWithSize, + PointerWithSize*) final; + +private: + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings); + + PointerWithSize BackwardPhaseCPU(); + + //! Loss for each row of the input; unused for now because loss doesn't + //! need to be calculated for correctness + // std::vector input_loss_; + + //! Each thread gets storage to allocate the gradients during backward + //! 
prop; each is the size of a feature vector + galois::substrate::PerThreadStorage> + norm_gradient_vectors_; +}; + +} // namespace galois diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 0d065d6bcc..294dc9f7be 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -107,15 +107,6 @@ galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, return loss; } -void galois::GNNCrossEntropyDerivative(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input, - GNNFloat* gradients) { - for (size_t i = 0; i < vector_length; i++) { - gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); - } -} - void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index afd46f1bcb..ce89b4b780 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -2,6 +2,7 @@ #include "galois/GraphNeuralNetwork.h" #include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/SoftmaxLayer.h" +#include "galois/layers/SigmoidLayer.h" galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, @@ -54,9 +55,22 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( config_.num_intermediate_layers(), *graph_, output_dims))); break; + case (GNNOutputLayerType::kSigmoid): + gnn_layers_.push_back(std::move(std::make_unique( + config_.num_intermediate_layers(), *graph_, output_dims))); + break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + + // sanity checking multi-class + output layer + if (!graph_->is_single_class_label() && + (config_.output_layer_type() != GNNOutputLayerType::kSigmoid)) { + GALOIS_LOG_WARN( + "Using a non-sigmoid output layer with a multi-class label!"); + // if debug mode just kill program + assert(false); + } } float galois::GraphNeuralNetwork::Train(size_t num_epochs) { diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp new file mode 100644 index 0000000000..8db6b8e0cc --- /dev/null +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -0,0 +1,86 @@ +#include "galois/layers/SigmoidLayer.h" +#include "galois/GNNMath.h" +#include + +// TODO(loc) GPU support + +const galois::PointerWithSize +galois::SigmoidLayer::ForwardPhaseCPU( + const galois::PointerWithSize input_embeddings) { + // loss is ignored for now anyways + // input_loss_.assign(input_loss_.size(), 0.0); + forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + const size_t feature_length = layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned local_node) { + if (graph_.IsValidForPhase(local_node, layer_phase_)) { + size_t node_offset = feature_length * local_node; + // sigmoid the values for this node + for (unsigned index = 0; index < feature_length; index++) { + forward_output_matrix_[node_offset + index] = + 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + } + // TODO(loc) calculate loss (it's not even being used/not required + // for correctness so I'm ignoring it for now) + } + }, + galois::steal(), galois::loopname("SigmoidForward")); + + return forward_output_matrix_; +} + +const galois::PointerWithSize +galois::SigmoidLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) 
when GPU needs it + return 0; +#else + return ForwardPhaseCPU(input_embeddings); +#endif +} + +galois::PointerWithSize +galois::SigmoidLayer::BackwardPhaseCPU() { + const size_t feature_length = layer_dimensions_.input_columns; + backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned local_node) { + if (graph_.IsValidForPhase(local_node, layer_phase_)) { + // derivative cross entropy into norm grad + const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); + size_t node_offset = feature_length * local_node; + std::vector* norm_gradient = + norm_gradient_vectors_.getLocal(); + GNNCrossEntropyDerivative(feature_length, ground_truth, + &(forward_output_matrix_[node_offset]), + norm_gradient->data()); + + // sigmoid derivative + for (unsigned index = 0; index < feature_length; index++) { + backward_output_matrix_[node_offset + index] = + (*norm_gradient)[index] * + forward_output_matrix_[node_offset + index] * + (1.0 - forward_output_matrix_[node_offset + index]); + } + } + }, + galois::steal(), galois::loopname("SigmoidBackward")); + + return backward_output_matrix_; +} + +galois::PointerWithSize +galois::SigmoidLayer::BackwardPhase(const PointerWithSize, + PointerWithSize*) { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) when GPU needs it + return 0; +#else + return BackwardPhaseCPU(); +#endif +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 820bd03019..6bf0ac6bd8 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -30,12 +30,16 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(epoch-test epoch-test.cpp) target_link_libraries(epoch-test galois_gnn) add_test(NAME epoch-test COMMAND epoch-test) - + + add_executable(multilabel-epoch-test multilabel-epoch-test.cpp) + target_link_libraries(multilabel-epoch-test galois_gnn) + add_test(NAME multilabel-epoch-test COMMAND multilabel-epoch-test) + # TODO figure out how to make this test run in parallel add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) - + add_executable(weight-sync-test weight-sync-test.cpp) target_link_libraries(weight-sync-test galois_gnn) diff --git a/libgnn/test/multilabel-epoch-test.cpp b/libgnn/test/multilabel-epoch-test.cpp new file mode 100644 index 0000000000..3fb96f8c81 --- /dev/null +++ b/libgnn/test/multilabel-epoch-test.cpp @@ -0,0 +1,59 @@ +//! @file multilabel-epoch-test.cpp +//! 
Run 100 epochs of multilabel dataset + +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + // size_t num_threads = galois::setActiveThreads(1); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = { + 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + galois::GNNLayerConfig layer_config; + layer_config.do_dropout = true; + layer_config.do_activation = false; + layer_config.do_normalization = true; + // XXX Activation kills accuracy compared to old code, esp. for cora + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSigmoid, + layer_config); + + std::vector adam_sizes = {16 * test_graph->node_feature_length(), + 16 * test_graph->GetNumLabelClasses()}; + auto adam = std::make_unique(adam_sizes, 2); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + + ////////////////////////////////////////////////////////////////////////////// + + // no verification; test should be eyeballed to make sure accuracy is + // increasing + galois::StatTimer main_timer("Timer_0"); + main_timer.start(); + for (size_t epoch = 0; epoch < 100; epoch++) { + galois::PointerWithSize predictions = gnn->DoInference(); + gnn->GradientPropagation(); + galois::gPrint("Epoch ", epoch, ": Accuracy is ", + gnn->GetGlobalAccuracy(predictions), "\n"); + } + + // check test accuracy + gnn->SetLayerPhases(galois::GNNPhase::kTest); + galois::PointerWithSize predictions = gnn->DoInference(); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(predictions), + "\n"); + main_timer.stop(); +} From 50537bc4ad641cae6536f3ba7c99220660ff0e1e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 12 Jan 2021 13:52:06 -0600 Subject: [PATCH 451/660] Sigmoid and multiclass option added to gnnbench Users can now specify multiclass and Sigmoid output layer in gnn applications. --- lonestar/libgnnbench/include/GNNBench/Input.h | 5 +++- lonestar/libgnnbench/src/Input.cpp | 23 +++++++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index fc5059bb0c..737887a756 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -21,7 +21,10 @@ extern llvm::cl::opt do_activation; // TODO activation layer type once more are supported //! Controls weight normalization based on degree extern llvm::cl::opt do_normalization; -// TODO output layer type +//! Output layer type +extern llvm::cl::opt output_layer_type; +//! If true, use multiclass ground truth +extern llvm::cl::opt multiclass_labels; // TODO optimizer type //! Toggles an optimization that flips aggregate/update step if it would be //! 
beneficial diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 97ef7a6fc3..d1dbb5bba3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -59,6 +59,20 @@ llvm::cl::opt "features based on their degree"), cll::init(true)); +llvm::cl::opt output_layer_type( + "outputLayer", cll::desc("Type of output layer"), + cll::values(clEnumValN(galois::GNNOutputLayerType::kSoftmax, "softmax", + "Softmax (default)"), + clEnumValN(galois::GNNOutputLayerType::kSigmoid, "sigmoid", + "Sigmoid")), + cll::init(galois::GNNOutputLayerType::kSoftmax)); + +llvm::cl::opt + multiclass_labels("multiclassLabels", + cll::desc("If true (off by default), use multi-class " + "ground truth; required for some inputs"), + cll::init(false)); + llvm::cl::opt agg_after_update("allowAggregationAfterUpdate", cll::desc("If true (on by default), allows aggregate to " @@ -161,7 +175,7 @@ std::unique_ptr InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { // partition/load graph auto gnn_graph = std::make_unique( - input_directory, input_name, partition_scheme, true); + input_directory, input_name, partition_scheme, !multiclass_labels); // create layer types vector std::vector layer_types; @@ -174,10 +188,9 @@ InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { // layer config object galois::GNNLayerConfig layer_config = CreateLayerConfig(); // GNN config object - // TODO output type should be configurable - galois::GraphNeuralNetworkConfig gnn_config( - num_layers, layer_types, layer_sizes_vector, - galois::GNNOutputLayerType::kSoftmax, layer_config); + galois::GraphNeuralNetworkConfig gnn_config(num_layers, layer_types, + layer_sizes_vector, + output_layer_type, layer_config); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From 3ffe9ed26a2d7a91e0e80ea78263bffd271f96a4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 19 Jan 2021 16:51:28 -0600 Subject: [PATCH 452/660] Fixing cross-entropy loss; no ReLU for last layer Cross-entropy loss was not punishing false positives before. This has been fixed. Also, ReLU has been turned off for the last layer whenever activation is used as this seems to completely destroy performance. --- libgnn/include/galois/GNNMath.h | 44 +++++++++++++++++++-- libgnn/include/galois/layers/GNNLayer.h | 2 + libgnn/include/galois/layers/SigmoidLayer.h | 9 ++--- libgnn/src/GNNMath.cpp | 25 ------------ libgnn/src/GraphNeuralNetwork.cpp | 4 ++ libgnn/src/layers/SigmoidLayer.cpp | 17 ++++++-- 6 files changed, 63 insertions(+), 38 deletions(-) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index f8edd9650f..231d437836 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -1,7 +1,9 @@ #pragma once +#include "galois/Logging.h" #include "galois/GNNTypes.h" #include +#include namespace galois { @@ -25,9 +27,35 @@ void GNNSoftmaxDerivative(const size_t vector_length, GNNFloat* temp_vector, GNNFloat* output); //! Performs cross entropy given a ground truth and input and returns the loss //! value. +template galois::GNNFloat GNNCrossEntropy(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input); + const TruthType* ground_truth, + const GNNFloat* input) { + GNNFloat loss = 0.0; + + // Note that this function works if there are multiple non-zeros in the + // ground truth vector + // If there is only 1 then this function is overkill and it should break + // early (i.e. 
single class) + // Multiclass = fine + for (size_t i = 0; i < vector_length; i++) { + if (ground_truth[i] == 0.0) { + if (input[i] == 1.0) { + loss -= std::log(static_cast(1e-10)); + } else { + loss -= std::log(1 - input[i]); + } + } else { + if (input[i] == 0.0) { + loss -= std::log(static_cast(1e-10)); + } else { + loss -= std::log(input[i]); + } + } + } + + return loss; +} //! Derivative of cross entropy; gradients saved into an output vector. template @@ -35,7 +63,17 @@ void GNNCrossEntropyDerivative(const size_t vector_length, const TruthType* ground_truth, const GNNFloat* input, GNNFloat* gradients) { for (size_t i = 0; i < vector_length; i++) { - gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); + // TODO(loc) assumption: binary classifier, make explicit in function name + if (ground_truth[i]) { + gradients[i] = -1.0 / (input[i] + static_cast(1e-10)); + } else { + if (input[i] == 1.0) { + // opposite + gradients[i] = 1.0 / static_cast(1e-10); + } else { + gradients[i] = 1.0 / (1.0 - input[i]); + } + } } } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 93498a6497..f8d8cd8d8a 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -75,6 +75,8 @@ class GNNLayer { //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } + void DisableActivation() { config_.do_activation = false; } + //! Initializes all layer weights to 1. This is used as a debug function for //! testing. void InitAllWeightsTo1() { diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 44c215909d..7efe8cd9db 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -14,7 +14,7 @@ class SigmoidLayer : public GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig{.allocate_weights = false}), - // input_loss_(dimensions.input_rows), + input_loss_(dimensions.input_rows), norm_gradient_vectors_(dimensions.input_columns) { output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; // input/output columns must be equivalent @@ -36,13 +36,10 @@ class SigmoidLayer : public GNNLayer { private: const PointerWithSize ForwardPhaseCPU(const PointerWithSize input_embeddings); - PointerWithSize BackwardPhaseCPU(); - //! Loss for each row of the input; unused for now because loss doesn't - //! need to be calculated for correctness - // std::vector input_loss_; - + //! Loss for each row of the input + std::vector input_loss_; //! Each thread gets storage to allocate the gradients during backward //! 
prop; each is the size of a feature vector galois::substrate::PerThreadStorage> diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 294dc9f7be..dcaaf31a42 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -1,4 +1,3 @@ -#include #include #include #include "galois/GNNMath.h" @@ -83,30 +82,6 @@ void galois::GNNSoftmaxDerivative(const size_t vector_length, } } -galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input) { - GNNFloat loss = 0.0; - - // Note that this function works if there are multiple non-zeros in the - // ground truth vector - // If there is only 1 then this function is overkill and it should break - // early - for (size_t i = 0; i < vector_length; i++) { - if (ground_truth[i] == 0.0) { - continue; - } - - if (input[i] == 0.0) { - loss -= ground_truth[i] * std::log(static_cast(1e-10)); - } else { - loss -= ground_truth[i] * std::log(input[i]); - } - } - - return loss; -} - void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ce89b4b780..aae4fbb8a1 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -36,6 +36,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, layer_dims, config_.default_layer_config()))); + if (i == config_.num_intermediate_layers() - 1) { + // last layer before output layer should never have activation + gnn_layers_.back()->DisableActivation(); + } break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 8db6b8e0cc..1b6fe9eb05 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -7,10 +7,11 @@ const galois::PointerWithSize galois::SigmoidLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { - // loss is ignored for now anyways - // input_loss_.assign(input_loss_.size(), 0.0); + input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; + galois::GAccumulator total_loss; + total_loss.reset(); galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), @@ -21,13 +22,21 @@ galois::SigmoidLayer::ForwardPhaseCPU( for (unsigned index = 0; index < feature_length; index++) { forward_output_matrix_[node_offset + index] = 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + // if (local_node == 0) { + // galois::gPrint(forward_output_matrix_[node_offset + index], + // "\n"); + //} } - // TODO(loc) calculate loss (it's not even being used/not required - // for correctness so I'm ignoring it for now) + + input_loss_[local_node] = GNNCrossEntropy( + feature_length, graph_.GetMultiClassLabel(local_node), + &forward_output_matrix_[node_offset]); + total_loss += input_loss_[local_node]; } }, galois::steal(), galois::loopname("SigmoidForward")); + galois::gPrint("Total loss is ", total_loss.reduce(), "\n"); return forward_output_matrix_; } From 3bf83392a35aa033f1f08099699d4566a4e408f2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 22 Jan 2021 16:04:01 -0600 Subject: [PATCH 453/660] Fixing Sigmoid and GCN derivatives Sigmoid derivative has 
been changed to a numerically stable and simple subtraction calculation. GCN derivative in the case of aggregate -> xform was incorrect: it was multiplying with non-aggregated features. This caused gradient explosion and degrading accuracy as time went on due to the gradients being non-sensical. --- libgnn/src/layers/GraphConvolutionalLayer.cpp | 19 ++++++++++-- libgnn/src/layers/SigmoidLayer.cpp | 30 +++++++++---------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 9c4379dbcc..9967a76773 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -112,12 +112,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( &input_column_intermediates_); } // weight gradient calculation - // TODO put this in a function to put the ifdef in there + // TODO(loc) put this in a function to put the ifdef in there #ifndef GALOIS_ENABLE_GPU + // temp 2 holds aggregated feature vectors from forward phase galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), input_gradient->data(), + p_in_temp_2_.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #else gpu_object_.GetWeightGradientsGPU( @@ -189,7 +190,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first - // TODO can init to self as well + // TODO(loc) can init to self as well to add to self for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = 0; } @@ -225,6 +226,18 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( &aggregate_output[index_to_src_feature]); } } + + // GNNFloat* intermediate = pts->getLocal()->data(); + // GNNFloat norm_scale = source_norm * source_norm; + // for (size_t i = 0; i < column_length; i++) { + // intermediate[i] = + // norm_scale * node_embeddings[index_to_src_feature + i]; + //} + //// add self + // galois::VectorAdd(column_length, + // &aggregate_output[index_to_src_feature], + // intermediate, + // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 1b6fe9eb05..a676383e6f 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -20,17 +20,21 @@ galois::SigmoidLayer::ForwardPhaseCPU( size_t node_offset = feature_length * local_node; // sigmoid the values for this node for (unsigned index = 0; index < feature_length; index++) { - forward_output_matrix_[node_offset + index] = - 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); - // if (local_node == 0) { - // galois::gPrint(forward_output_matrix_[node_offset + index], - // "\n"); - //} + // splitting in half is done for numerical stability of log + if (input_embeddings[node_offset + index] >= 0) { + forward_output_matrix_[node_offset + index] = + 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + } else { + forward_output_matrix_[node_offset + index] = + expf(input_embeddings[node_offset + index]) / + (1.0 + expf(input_embeddings[node_offset + index])); + } } input_loss_[local_node] = GNNCrossEntropy( feature_length, graph_.GetMultiClassLabel(local_node), &forward_output_matrix_[node_offset]); + // TODO(loc) normalize the loss 
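          // [Editor's note, not part of the original patch] The sigmoid
          // evaluation above splits on the sign of the input; this is the
          // standard numerically stable form: for x >= 0 it computes
          // 1 / (1 + exp(-x)), and for x < 0 the algebraically equivalent
          // exp(x) / (1 + exp(x)), so expf() is only ever called on a
          // non-positive argument and cannot overflow.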
total_loss += input_loss_[local_node]; } }, @@ -63,18 +67,12 @@ galois::SigmoidLayer::BackwardPhaseCPU() { // derivative cross entropy into norm grad const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); size_t node_offset = feature_length * local_node; - std::vector* norm_gradient = - norm_gradient_vectors_.getLocal(); - GNNCrossEntropyDerivative(feature_length, ground_truth, - &(forward_output_matrix_[node_offset]), - norm_gradient->data()); - - // sigmoid derivative + // sigmoid-cross-entropy derivative: turns out all it is is simple + // subtraction for (unsigned index = 0; index < feature_length; index++) { backward_output_matrix_[node_offset + index] = - (*norm_gradient)[index] * - forward_output_matrix_[node_offset + index] * - (1.0 - forward_output_matrix_[node_offset + index]); + forward_output_matrix_[node_offset + index] - + ground_truth[index]; } } }, From ef58e82d77b8d42fe6b594cbd407a8799d033d32 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 27 Jan 2021 15:24:19 -0600 Subject: [PATCH 454/660] Fixing GPU code/build Fixes backpropagation gradient used in GPU to match fix applied to CPU code. Fixes a CMake var used by the GPU build (no clue how it was building before). Adds dummy uses/returns to SigmoidLayer gpu (which needs to be implemented). --- CMakeLists.txt | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 2 +- libgnn/src/layers/SigmoidLayer.cpp | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41f318b828..bb72f24c71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,7 @@ if (GALOIS_ENABLE_GPU) set(CUDA_PROPAGATE_HOST_FLAGS off) set(CUDA_HOST_COMPILER g++) - string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "." "" GENCODES ${GALOIS_CUDA_CAPABILITY}) string(REPLACE "," ";" GENCODES ${GENCODES}) foreach(GENCODE ${GENCODES}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 9967a76773..ed05ddf4be 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -123,7 +123,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( #else gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, p_in_temp_2_.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #endif } else { diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index a676383e6f..3ae7492046 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -49,7 +49,8 @@ galois::SigmoidLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { #ifdef GALOIS_ENABLE_GPU // TODO(loc) when GPU needs it - return 0; + printf("%p\n", input_embeddings.data()); + return p_layer_weights_; #else return ForwardPhaseCPU(input_embeddings); #endif @@ -86,7 +87,7 @@ galois::SigmoidLayer::BackwardPhase(const PointerWithSize, PointerWithSize*) { #ifdef GALOIS_ENABLE_GPU // TODO(loc) when GPU needs it - return 0; + return p_layer_weights_; #else return BackwardPhaseCPU(); #endif From eca2d0dd75858ed2383e291c9b7fc36b416a9b88 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 28 Jan 2021 16:49:31 -0600 Subject: [PATCH 455/660] Various comments on GCN layer added a few comments to 
better mark exactly what is going on in the GCN forward/backward pass Added a label class print as well --- libgnn/src/graphs/GNNGraph.cpp | 7 ++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 2753e07f3d..646f39db8e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -146,6 +146,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, size_t num_nodes; file_stream >> num_nodes >> num_label_classes_ >> std::ws; assert(num_nodes == partitioned_graph_->globalSize()); + galois::gPrint("Number of label classes is ", num_label_classes_, "\n"); // allocate memory for labels if (has_single_class_label) { @@ -454,7 +455,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( galois::iterate(begin_owned(), end_owned()), [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { - size_t label_index = lid * num_label_classes_ + label_class; + size_t label_index = lid * num_label_classes_ + label_class; + GNNLabel true_label = full_ground_truth[label_index]; GNNLabel prediction_is_positive = (predictions[label_index] > 0.5) ? 1 : 0; @@ -510,6 +512,9 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( global_f1_score += class_f1_score; } // end label class loop + // GALOIS_LOG_WARN("{} {} {} {}", global_true_positive, global_true_negative, + // global_false_positive, global_false_negative); + // double global_f1_macro_score = global_f1_score / num_label_classes_; // micro = considers all classes for precision/recall diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index ed05ddf4be..48759a9bfa 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -66,7 +66,9 @@ galois::GraphConvolutionalLayer::ForwardPhase( UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); } else { // update to aggregate + // FW UpdateEmbeddings(input_data, p_out_temp_.data()); + // A(FW) AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), &output_column_intermediates_); @@ -95,6 +97,8 @@ galois::GraphConvolutionalLayer::BackwardPhase( ActivationDerivative(input_gradient); } + // AFW = O + // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (!config_.allow_aggregate_after_update || @@ -105,11 +109,15 @@ galois::GraphConvolutionalLayer::BackwardPhase( layer_dimensions_.input_rows * layer_dimensions_.output_columns); assert(p_in_temp_1_.size() == layer_dimensions_.input_columns * layer_dimensions_.input_rows); + // pintemp1 contains (AF)' UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), p_backward_output_matrix_.data(), &input_column_intermediates_); + // TODO if training A, then A' compute here if layer # is 0 + // dot product of edges that exist in A } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there @@ -127,16 +135,21 @@ galois::GraphConvolutionalLayer::BackwardPhase( input_gradient->data(), p_layer_weight_gradients_.data()); #endif } else { + // TODO at this point, out_temp contains memoized FW + // can use it to get A' = O' (FW)^T // aggregate occurs regardless of layer 
being equal to 0 because it is // required in this case for the weight gradient calculation + // this is (FW)' AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_); if (layer_number_ != 0) { // derivative for update + // backout = F' UpdateEmbeddingsDerivative(p_out_temp_.data(), p_backward_output_matrix_.data()); } // TODO put this in a function + // W' = F^T (FW)' #ifndef GALOIS_ENABLE_GPU // weight gradient; note the use of the aggregated gradient in out_temp galois::CBlasSGEMM( From 0dc99b2d1ccbac6aa1dd3c7f8368dfbd048b84ac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 28 Jan 2021 17:10:59 -0600 Subject: [PATCH 456/660] Add sigmoid test, fix conv test Adds sigmoid test (which doens't do an automated correctness check because floats are pain). Fixes the convolutional layer test after fixing the backward phase a few commits ago. --- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/convlayer-test.cpp | 20 +++++----- libgnn/test/sigmoidlayer-test.cpp | 64 +++++++++++++++++++++++++++++++ libgnn/test/softmaxlayer-test.cpp | 9 +++-- 4 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 libgnn/test/sigmoidlayer-test.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 6bf0ac6bd8..18a854ff8f 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -11,6 +11,10 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) + add_executable(sigmoidlayer-test sigmoidlayer-test.cpp) + target_link_libraries(sigmoidlayer-test galois_gnn) + add_test(NAME sigmoidlayer-test COMMAND sigmoidlayer-test) + add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index ae23fa4f23..58d1d7d581 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -127,11 +127,11 @@ int main() { layer_0->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); layer_0.reset(); @@ -196,11 +196,11 @@ int main() { layer_1->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); layer_1.reset(); diff --git 
a/libgnn/test/sigmoidlayer-test.cpp b/libgnn/test/sigmoidlayer-test.cpp new file mode 100644 index 0000000000..333651bdf5 --- /dev/null +++ b/libgnn/test/sigmoidlayer-test.cpp @@ -0,0 +1,64 @@ +//! @file sigmoidlayer-test.cpp +//! Sigmoid layer test with a test graph +//! No automated ground truth checking; when this was written it was compared +//! manually with pytorch +//! TODO add in automated checking eventually; for now this just makes sure it +//! runs + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SigmoidLayer.h" + +int main() { + galois::DistMemSys G; + + galois::setActiveThreads(1); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); + + GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + + // input to softmax + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[1] = 1; + softmax_input[2] = 100000000000; + softmax_input[3] = 100000000000000000; + softmax_input[4] = -1000; + softmax_input[5] = -10; + softmax_input[6] = 1000000; + softmax_input[8] = 1; + softmax_input[9] = 1; + softmax_input[10] = 1; + softmax_input[16] = 1; + softmax_input[17] = 1; + softmax_input[18] = 1; + softmax_input[24] = 0; + softmax_input[32] = 0; + softmax_input[40] = 0; + softmax_input[48] = 0; + + // train mode + auto output_layer = + std::make_unique(3, test_graph, dimension_0); + output_layer->ForwardPhase(softmax_input); + + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 1\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + if (i % 7 == 0) { + printf("--------------\n"); + } + printf("%f\n", asdf[i]); + } +} diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index 9f15bedfa3..7a6de416dc 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -1,5 +1,9 @@ -//! @file convlayer-test.cpp +//! @file softmaxlayer-test.cpp //! Softmax layer test with a test graph +//! No automated ground truth checking; when this was written it was compared +//! manually with pytorch +//! TODO add in automated checking eventually; for now this just makes sure it +//! runs #include "galois/Logging.h" #include "galois/GNNMath.h" @@ -117,7 +121,4 @@ int main() { GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); } - - // TODO in future maybe: add better test for backward phase besides just - // running it } From 53f6140ba5d6d671ec81d5efccc6cb4156b07368 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 30 Jan 2021 13:08:49 -0600 Subject: [PATCH 457/660] gpuconv layer test fix based on backward phase fix Fixing conv layer test for gpu after backward gcn fix. 
--- libgnn/test/gpu-convlayer-test.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index f4bb4cf4d3..a79262d706 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -112,11 +112,11 @@ int main() { layer_0->CopyWeightGradientsFromGPU(); // make sure they are sane GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); layer_0.reset(); @@ -183,11 +183,11 @@ int main() { layer_1->CopyWeightGradientsFromGPU(); // make sure they are sane GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); layer_1.reset(); From f0802dfe7fb9d0a2b06551c236f08d4b5b21d5c8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Feb 2021 13:27:13 -0600 Subject: [PATCH 458/660] Configuration hooks for random subgraph sampling Adds configuration hooks for random subgraph sampling via a on/off flag on a node. Renames var to enable/disable agg after update optimization --- libgnn/include/galois/GraphNeuralNetwork.h | 45 +++++++++--- libgnn/include/galois/graphs/GNNGraph.h | 72 +++++++++++++------ libgnn/include/galois/layers/GNNLayer.h | 10 ++- libgnn/src/GraphNeuralNetwork.cpp | 17 +++++ libgnn/src/graphs/GNNGraph.cpp | 10 +++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 +- libgnn/test/aggregate-sync-test.cpp | 2 +- libgnn/test/convlayer-test.cpp | 10 +-- libgnn/test/gnnfb-test.cpp | 2 +- libgnn/test/weight-sync-test.cpp | 2 +- lonestar/libgnnbench/include/GNNBench/Input.h | 2 +- lonestar/libgnnbench/src/Input.cpp | 20 +++--- 12 files changed, 143 insertions(+), 53 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 51142b9b38..9aa7d8189e 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -23,22 +23,44 @@ namespace galois { class GraphNeuralNetworkConfig { public: //! Construction without a config for layers specified; uses a default + //! also no sampling specified GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type) : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, - output_layer_type, GNNLayerConfig()) {} + output_layer_type, false, GNNLayerConfig()) {} + + //! 
Construction without a config for layers specified + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type, + bool do_sampling) + : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, + output_layer_type, do_sampling, + GNNLayerConfig()) {} + + //! Construction without sampling specified + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type, + const GNNLayerConfig& default_layer_config) + : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, + output_layer_type, false, + default_layer_config) {} //! Construction with a specified config for layers GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type, + bool do_sampling, const GNNLayerConfig& default_layer_config) : num_intermediate_layers_(num_layers), layer_types_(layer_types), layer_column_sizes_(layer_column_sizes), - output_layer_type_(output_layer_type), + output_layer_type_(output_layer_type), do_sampling_(do_sampling), default_layer_config_(default_layer_config) { // Do sanity checks on inputs // should have a type for each layer @@ -51,25 +73,30 @@ class GraphNeuralNetworkConfig { } //! # layers NOT including output layer - size_t num_intermediate_layers() { return num_intermediate_layers_; } + size_t num_intermediate_layers() const { return num_intermediate_layers_; } //! Get intermediate layer i - GNNLayerType intermediate_layer_type(size_t i) { + GNNLayerType intermediate_layer_type(size_t i) const { assert(i < num_intermediate_layers_); return layer_types_[i]; } //! Get intermediate layer i's size - size_t intermediate_layer_size(size_t i) { + size_t intermediate_layer_size(size_t i) const { assert(i < num_intermediate_layers_); return layer_column_sizes_[i]; } //! Type of output layer - GNNOutputLayerType output_layer_type() { return output_layer_type_; } + GNNOutputLayerType output_layer_type() const { return output_layer_type_; } //! Size of output layer is last element of layer column sizes - size_t output_layer_size() { + size_t output_layer_size() const { return layer_column_sizes_[num_intermediate_layers_]; } + + bool do_sampling() const { return do_sampling_; } + //! Get the default layer config of layers in this GNN - const GNNLayerConfig& default_layer_config() { return default_layer_config_; } + const GNNLayerConfig& default_layer_config() const { + return default_layer_config_; + } private: //! Number of layers to construct in the GNN not including the output @@ -83,6 +110,8 @@ class GraphNeuralNetworkConfig { std::vector layer_column_sizes_; //! Output layer type GNNOutputLayerType output_layer_type_; + //! Graph sampling + bool do_sampling_; //! 
Default config to use for layers GNNLayerConfig default_layer_config_; }; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index cfed56aade..4400809940 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -1,6 +1,7 @@ #pragma once #include "galois/GNNTypes.h" +#include "galois/PerThreadRNG.h" #include "galois/graphs/CuSPPartitioner.h" #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" @@ -140,10 +141,52 @@ class GNNGraph { void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size) const; + ////////////////////////////////////////////////////////////////////////////// + // Sampling related + ////////////////////////////////////////////////////////////////////////////// + + //! Loops through all master nodes and determines if it is "on" or "off" + //! (the meaning of on and off depends on how it is used; for now, it is used + //! to indicate subgraph presence) + void UniformNodeSample(); + + //! Returns true if a particular node is currently considered "in" a sampled + //! graph + bool IsInSampledGraph(const NodeIterator& ni) const { + // TODO(loc) GPU + return partitioned_graph_->getData(*ni); + } + #ifdef GALOIS_ENABLE_GPU const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } #endif private: + ////////////////////////////////////////////////////////////////////////////// + // Initialization + ////////////////////////////////////////////////////////////////////////////// + + //! Read labels of local nodes only + void ReadLocalLabels(const std::string& dataset_name, + bool has_single_class_label); + //! Read features of local nodes only + void ReadLocalFeatures(const std::string& dataset_str); + //! Helper function to read masks from file into the appropriate structures + //! given a name, mask type, and arrays to save into + size_t ReadLocalMasksFromFile(const std::string& dataset_name, + const std::string& mask_type, + GNNRange* mask_range, char* masks); + //! Read masks of local nodes only for training, validation, and testing + void ReadLocalMasks(const std::string& dataset_name); + //! Reads the entire graph topology in (but nothing else) + void ReadWholeGraph(const std::string& dataset_name); + //! Initializes the norm factors using the entire graph's topology for global + //! degree access + void InitNormFactor(); + + ////////////////////////////////////////////////////////////////////////////// + // Accuracy + ////////////////////////////////////////////////////////////////////////////// + float GetGlobalAccuracyCPU(PointerWithSize predictions, GNNPhase phase); float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, @@ -151,6 +194,10 @@ class GNNGraph { float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, GNNPhase phase); + ////////////////////////////////////////////////////////////////////////////// + // Vars + ////////////////////////////////////////////////////////////////////////////// + //! Directory for input data const std::string input_directory_; //! In a multi-host setting, this variable stores the host id that the graph @@ -199,29 +246,10 @@ class GNNGraph { //! 
Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; - // TODO vars for subgraphs as necessary - - ////////////////////////////////////////////////////////////////////////////// - // Initialization - ////////////////////////////////////////////////////////////////////////////// + //! RNG for subgraph sampling + galois::PerThreadRNG sample_rng_; - //! Read labels of local nodes only - void ReadLocalLabels(const std::string& dataset_name, - bool has_single_class_label); - //! Read features of local nodes only - void ReadLocalFeatures(const std::string& dataset_str); - //! Helper function to read masks from file into the appropriate structures - //! given a name, mask type, and arrays to save into - size_t ReadLocalMasksFromFile(const std::string& dataset_name, - const std::string& mask_type, - GNNRange* mask_range, char* masks); - //! Read masks of local nodes only for training, validation, and testing - void ReadLocalMasks(const std::string& dataset_name); - //! Reads the entire graph topology in (but nothing else) - void ReadWholeGraph(const std::string& dataset_name); - //! Initializes the norm factors using the entire graph's topology for global - //! degree access - void InitNormFactor(); + // TODO vars for subgraphs as necessary ////////////////////////////////////////////////////////////////////////////// // GPU things diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index f8d8cd8d8a..2924520661 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -48,9 +48,11 @@ struct GNNLayerConfig { bool do_activation{false}; //! True if normalization is to occur during multiplies bool do_normalization{false}; - //! If this is true, aggregate may occur after multiply if # of input columns + //! If this is false, aggregate may occur after multiply if # of input columns //! is higher than output columns to do less work in aggregation - bool allow_aggregate_after_update{true}; + bool disable_aggregate_after_update{false}; + //! Graph sampling flag in use or not + bool do_sampling{false}; // TODO activation type; for now default is softmax }; @@ -135,6 +137,10 @@ class GNNLayer { //! stored in the layer void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + //! Flip sampling switch on + void EnableSampling() { config_.do_sampling = true; } + bool IsSampledLayer() { return config_.do_sampling; } + #ifdef GALOIS_ENABLE_GPU //! 
Utility function for allocating PointerWithSize AllocateGPU(const std::vector& v) { diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index aae4fbb8a1..7892bf4f9e 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -75,12 +75,29 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( // if debug mode just kill program assert(false); } + + // flip sampling + if (config_.do_sampling()) { + for (std::unique_ptr& ptr : gnn_layers_) { + ptr->EnableSampling(); + } + } } float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); + if (config_.do_sampling()) { + for (std::unique_ptr& ptr : gnn_layers_) { + assert(ptr->IsSampledLayer()); + } + } + // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { + if (config_.do_sampling()) { + // subgraph sample every epoch + graph_->UniformNodeSample(); + } const PointerWithSize predictions = DoInference(); GradientPropagation(); float train_accuracy = GetGlobalAccuracy(predictions); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 646f39db8e..f110228fa3 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -130,6 +130,16 @@ void galois::graphs::GNNGraph::AggregateSync( "GraphAggregateSync"); } +void galois::graphs::GNNGraph::UniformNodeSample() { + galois::do_all( + galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { + partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(0.5); + }); + // TODO(loc) GPU + // TODO(loc) sync the flags across all machines to have same sample on all of + // them +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 48759a9bfa..2bef20ab1e 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -58,7 +58,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( } // flip aggregate/update if dimensions favor it (do less work) - if (!config_.allow_aggregate_after_update || + if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, @@ -101,7 +101,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // derivative of aggregation/update // TODO clean up logic here to reduce nesting - if (!config_.allow_aggregate_after_update || + if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { if (layer_number_ != 0) { // transposed sgemm for derivative; in_temp is output diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 600ac42018..d13674f1a2 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -30,7 +30,7 @@ int main() { dimension_0.input_columns = 3; dimension_0.output_columns = 2; galois::GNNLayerConfig l_config; - l_config.allow_aggregate_after_update = false; + l_config.disable_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 58d1d7d581..136953378d 100644 --- a/libgnn/test/convlayer-test.cpp +++ 
b/libgnn/test/convlayer-test.cpp @@ -52,7 +52,7 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = @@ -207,10 +207,10 @@ int main() { ////////////////////////////////////////////////////////////////////////////// galois::GNNLayerConfig config; - config.do_dropout = true; - config.do_activation = true; - config.do_normalization = true; - config.allow_aggregate_after_update = false; + config.do_dropout = true; + config.do_activation = true; + config.do_normalization = true; + config.disable_aggregate_after_update = false; // finally, just make sure dropout and activation run without crashes // (verification requires floating point accuracy or setting a seed which I diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index e7232ca108..224204bceb 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -24,7 +24,7 @@ int main() { // note this includes the output; last 2 must be same because softmax std::vector layer_output_sizes = {4, 7, 7}; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; // note GNNLayerConfig is passed in; use a config that does not do anything // extra like dropout or activation and the like so that input is easier to // verify diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 3ea524e4a7..4c2c01f844 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -21,7 +21,7 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = std::make_unique(0, *(test_graph.get()), diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 737887a756..598148af42 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -28,7 +28,7 @@ extern llvm::cl::opt multiclass_labels; // TODO optimizer type //! Toggles an optimization that flips aggregate/update step if it would be //! beneficial -extern llvm::cl::opt agg_after_update; +extern llvm::cl::opt disable_agg_after_update; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d1dbb5bba3..684e4111dd 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -73,11 +73,11 @@ llvm::cl::opt "ground truth; required for some inputs"), cll::init(false)); -llvm::cl::opt - agg_after_update("allowAggregationAfterUpdate", - cll::desc("If true (on by default), allows aggregate to " - "be done after update as an optimization"), - cll::init(true)); +llvm::cl::opt disable_agg_after_update( + "disableAggregationAfterUpdate", + cll::desc("If true (off by default), disables aggregate " + "after update optimization"), + cll::init(false)); const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { @@ -127,11 +127,11 @@ CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { //! 
Setup layer config struct based on cli args galois::GNNLayerConfig CreateLayerConfig() { galois::GNNLayerConfig layer_config; - layer_config.do_dropout = do_dropout; - layer_config.dropout_rate = dropout_rate; - layer_config.do_activation = do_activation; - layer_config.do_normalization = do_normalization; - layer_config.allow_aggregate_after_update = agg_after_update; + layer_config.do_dropout = do_dropout; + layer_config.dropout_rate = dropout_rate; + layer_config.do_activation = do_activation; + layer_config.do_normalization = do_normalization; + layer_config.disable_aggregate_after_update = disable_agg_after_update; return layer_config; } From e72595a75d44ed3d6215fe6101653551d91231a7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Feb 2021 16:35:20 -0600 Subject: [PATCH 459/660] Layers account for sampling: sampling test If sampling is enabled, layers now ignore the appropriate data depending on the state of the sampled flag in the graph. Adds a test to make sure sampling works sanely as well. TODO: for completeness, can test the agg/xform swap in the GCN layer. --- libgnn/include/galois/graphs/GNNGraph.h | 5 + libgnn/src/layers/GraphConvolutionalLayer.cpp | 15 +- libgnn/src/layers/SigmoidLayer.cpp | 10 + libgnn/src/layers/SoftmaxLayer.cpp | 10 + libgnn/test/CMakeLists.txt | 4 + libgnn/test/sample-test.cpp | 211 ++++++++++++++++++ 6 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 libgnn/test/sample-test.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 4400809940..242b63d4c3 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -150,6 +150,11 @@ class GNNGraph { //! to indicate subgraph presence) void UniformNodeSample(); + //! Makes a node "sampled"; used for debugging/testing + void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } + //! Makes a node "not sampled"; used for debugging/testing + void UnsetSampledNode(size_t node) { partitioned_graph_->getData(node) = 0; } + //! Returns true if a particular node is currently considered "in" a sampled //! 
graph bool IsInSampledGraph(const NodeIterator& ni) const { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 2bef20ab1e..46b997b087 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -208,6 +208,13 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( aggregate_output[index_to_src_feature + i] = 0; } + if (IsSampledLayer()) { + // check if node is part of sampled graph; ignore after 0'ing if not + // sampled + if (!graph_.IsInSampledGraph(src)) + return; + } + GNNFloat source_norm = 0.0; if (config_.do_normalization) { source_norm = graph_.NormFactor(src); @@ -215,7 +222,13 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { - size_t dst = graph_.EdgeDestination(e); + size_t dst = graph_.EdgeDestination(e); + if (IsSampledLayer()) { + // ignore non-sampled nodes + if (!graph_.IsInSampledGraph(dst)) + continue; + } + size_t index_to_dst_feature = dst * column_length; if (config_.do_normalization) { diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 3ae7492046..a7b373373c 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -17,6 +17,11 @@ galois::SigmoidLayer::ForwardPhaseCPU( galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(local_node)) + return; + } + size_t node_offset = feature_length * local_node; // sigmoid the values for this node for (unsigned index = 0; index < feature_length; index++) { @@ -65,6 +70,11 @@ galois::SigmoidLayer::BackwardPhaseCPU() { galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(local_node)) + return; + } + // derivative cross entropy into norm grad const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); size_t node_offset = feature_length * local_node; diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 562349780b..62dcabe622 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -12,6 +12,11 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(i)) + return; + } + if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], @@ -63,6 +68,11 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(i)) + return; + } + // create ground truth vector for this LID // TODO maybe make this part of the graph class instead of recreating // every time diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 18a854ff8f..a6b711397b 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -54,6 +54,10 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(f1-test f1-test.cpp) target_link_libraries(f1-test galois_gnn) add_test(NAME 
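Note on the aggregation change above: the pattern is simply to zero the output row, bail out early for sources that are not in the sampled graph, and skip unsampled destinations inside the edge loop. A minimal stand-alone sketch of that pattern, using plain CSR arrays and a per-node flag in place of the Galois graph and layer classes (those stand-ins are assumptions, not the real API):

    #include <cstddef>
    #include <vector>

    // Simplified stand-in for the sampled aggregation loop: CSR graph given by
    // row_start / edge_dst, one feature row of length `cols` per node, and a
    // per-node `sampled` flag. Unsampled sources produce an all-zero row;
    // unsampled destinations contribute nothing.
    void AggregateSampled(const std::vector<size_t>& row_start,
                          const std::vector<size_t>& edge_dst,
                          const std::vector<float>& features,
                          const std::vector<char>& sampled, size_t cols,
                          std::vector<float>* out) {
      size_t num_nodes = row_start.size() - 1;
      out->assign(num_nodes * cols, 0.0f);
      for (size_t src = 0; src < num_nodes; ++src) {
        if (!sampled[src])
          continue;                      // source not in the sampled subgraph
        for (size_t e = row_start[src]; e < row_start[src + 1]; ++e) {
          size_t dst = edge_dst[e];
          if (!sampled[dst])
            continue;                    // destination dropped this epoch
          for (size_t c = 0; c < cols; ++c)
            (*out)[src * cols + c] += features[dst * cols + c];
        }
      }
    }

The real layer additionally applies the degree-based norm factors and handles the train/validate/test phases; this only shows the sampling skip itself.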
f1-test COMMAND f1-test) + + add_executable(sample-test sample-test.cpp) + target_link_libraries(sample-test galois_gnn) + add_test(NAME sample-test COMMAND sample-test) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp new file mode 100644 index 0000000000..ead938e5aa --- /dev/null +++ b/libgnn/test/sample-test.cpp @@ -0,0 +1,211 @@ +//! @file sample-test.cpp +//! Sampling tester + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/SoftmaxLayer.h" +#include "galois/layers/SigmoidLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + + // choose a few sample nodes + test_graph.SetSampledNode(0); + test_graph.SetSampledNode(2); + test_graph.SetSampledNode(4); + test_graph.SetSampledNode(5); + test_graph.UnsetSampledNode(1); + test_graph.UnsetSampledNode(3); + test_graph.UnsetSampledNode(6); + + ////////////////////////////////////////////////////////////////////////////// + + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + layer_1->EnableSampling(); + + galois::PointerWithSize layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 0); + + // dummy 1 matrix + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(14, 1); + // 0 out unsampled nodes + dummy_ones_v[2] = 0; + dummy_ones_v[3] = 0; + dummy_ones_v[6] = 0; + dummy_ones_v[7] = 0; + dummy_ones_v[12] = 0; + dummy_ones_v[13] = 0; + + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 
21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 0); + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 9); + + layer_1.reset(); + + ////////////////////////////////////////////////////////////////////////////// + // softmax + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerDimensions dimension_out; + dimension_out.input_rows = 7; + dimension_out.input_columns = test_graph.GetNumLabelClasses(); + dimension_out.output_columns = test_graph.GetNumLabelClasses(); + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[8] = 1; + softmax_input[16] = 1; + softmax_input[24] = 1; + softmax_input[32] = 1; + softmax_input[40] = 1; + softmax_input[48] = 1; + + auto output_layer = + std::make_unique(3, test_graph, dimension_out); + output_layer->EnableSampling(); + galois::PointerWithSize prediction_distribution = + output_layer->ForwardPhase(softmax_input); + + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[0])) == 0); + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[2 * 7])) == + 2); + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[4 * 7])) == + 4); + + std::vector sampled_out = {1, 3, 6}; + // assert sampled out are all 0s + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + // softmax back: check sampled out is all 0s (others are floats, too painful) + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(asdf[i * 7 + 
0] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 6] == 0.0); + } + + output_layer.reset(); + + ////////////////////////////////////////////////////////////////////////////// + // sigmoid + ////////////////////////////////////////////////////////////////////////////// + galois::graphs::GNNGraph multi_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + auto sigmoid_layer = + std::make_unique(3, multi_graph, dimension_out); + sigmoid_layer->EnableSampling(); + // reuse softmax input; only thing interested in is checking for 0s + prediction_distribution = sigmoid_layer->ForwardPhase(softmax_input); + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + asdf = sigmoid_layer->BackwardPhase(softmax_input, nullptr); + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(asdf[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 6] == 0.0); + } + + return 0; +} From f573b1d857d9c85c0a2d5761b21ed8ff0b845422 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Feb 2021 16:22:34 -0600 Subject: [PATCH 460/660] gcn app sampling option: loss/accuracy report fix Adds CLI option for sampling for GCN app. Also fixes the loss/accuracy reporting by the library by averaging loss to # of nodes checked and taking into account sampling when calculating both. --- libgnn/include/galois/GraphNeuralNetwork.h | 2 ++ libgnn/include/galois/graphs/GNNGraph.h | 13 ++++++-- libgnn/include/galois/layers/GNNLayer.h | 4 +++ libgnn/src/GraphNeuralNetwork.cpp | 12 ++++---- libgnn/src/graphs/GNNGraph.cpp | 30 +++++++++++++++---- libgnn/src/layers/SigmoidLayer.cpp | 11 ++++--- libgnn/src/layers/SoftmaxLayer.cpp | 1 + lonestar/libgnnbench/include/GNNBench/Input.h | 2 ++ lonestar/libgnnbench/src/Input.cpp | 12 ++++++-- 9 files changed, 65 insertions(+), 22 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 9aa7d8189e..ed4cc19b8c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -171,6 +171,8 @@ class GraphNeuralNetwork { //! Returns classification accuracy for single class label or micro F1 score //! for multi-class predictions; this calls into GNNGraph's accuracy call float GetGlobalAccuracy(const PointerWithSize predictions); + float GetGlobalAccuracy(const PointerWithSize predictions, + bool sampling); //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. 
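The loss/accuracy fix in the next patch reduces to: when sampling is on, only nodes that are valid for the phase and present in the sampled graph are counted, and accuracy is correct / checked. A rough single-host sketch with plain vectors standing in for DGAccumulator and the GNNGraph masks (the helper names and mask layout here are assumptions):

    #include <cstddef>
    #include <vector>

    // Index of the largest of `classes` entries starting at `row`.
    static size_t ArgMax(const float* row, size_t classes) {
      size_t best = 0;
      for (size_t c = 1; c < classes; ++c)
        if (row[c] > row[best])
          best = c;
      return best;
    }

    // Accuracy over nodes valid for the current phase; when `sampling` is true,
    // nodes outside the sampled graph are skipped entirely so they neither
    // help nor hurt the score.
    float SampledAccuracy(const std::vector<float>& predictions,
                          const std::vector<int>& labels,
                          const std::vector<char>& in_phase,
                          const std::vector<char>& sampled, size_t classes,
                          bool sampling) {
      size_t checked = 0, correct = 0;
      for (size_t n = 0; n < labels.size(); ++n) {
        if (!in_phase[n])
          continue;
        if (sampling && !sampled[n])
          continue;                     // not part of this epoch's subgraph
        ++checked;
        if (ArgMax(&predictions[n * classes], classes) ==
            static_cast<size_t>(labels[n]))
          ++correct;
      }
      return checked ? static_cast<float>(correct) / checked : 0.0f;
    }

The distributed version accumulates `correct` and `checked` across hosts before dividing; that reduction is omitted here.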
Also known as a backward phase in diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 242b63d4c3..a4ef90ea4a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -96,8 +96,11 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase); + float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase, + bool sampling); //! Returns the ground truth label of some local id assuming labels are single //! class labels. @@ -161,6 +164,10 @@ class GNNGraph { // TODO(loc) GPU return partitioned_graph_->getData(*ni); } + bool IsInSampledGraph(size_t node_id) const { + // TODO(loc) GPU + return partitioned_graph_->getData(node_id); + } #ifdef GALOIS_ENABLE_GPU const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } @@ -193,11 +200,11 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// float GetGlobalAccuracyCPU(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase, bool sampling); float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase, bool sampling); float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase, bool sampling); ////////////////////////////////////////////////////////////////////////////// // Vars diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 2924520661..06bce9660f 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -221,6 +221,10 @@ class GNNLayer { galois::GNNOutputLayerType output_layer_type_{ galois::GNNOutputLayerType::kInvalid}; + // Used mainly for accuracy tracking + galois::DGAccumulator node_count_; + galois::DGAccumulator float_accumulator_; + ////////////////////////////////////////////////////////////////////////////// //! 
Init based from following paper diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7892bf4f9e..d9c0110a9f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -86,11 +86,11 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); - if (config_.do_sampling()) { - for (std::unique_ptr& ptr : gnn_layers_) { - assert(ptr->IsSampledLayer()); - } - } + // if (config_.do_sampling()) { + // for (std::unique_ptr& ptr : gnn_layers_) { + // assert(ptr->IsSampledLayer()); + // } + // } // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { @@ -136,7 +136,7 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( PointerWithSize predictions) { - return graph_->GetGlobalAccuracy(predictions, phase_); + return graph_->GetGlobalAccuracy(predictions, phase_, config_.do_sampling()); } void galois::GraphNeuralNetwork::GradientPropagation() { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index f110228fa3..4a1b3a2f99 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -393,20 +393,26 @@ void galois::graphs::GNNGraph::InitNormFactor() { float galois::graphs::GNNGraph::GetGlobalAccuracy( PointerWithSize predictions, GNNPhase phase) { // No GPU version yet, but this is where it would be - return GetGlobalAccuracyCPU(predictions, phase); + return GetGlobalAccuracy(predictions, phase, false); +} + +float galois::graphs::GNNGraph::GetGlobalAccuracy( + PointerWithSize predictions, GNNPhase phase, bool sampling) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracyCPU(predictions, phase, sampling); } float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( - PointerWithSize predictions, GNNPhase phase) { + PointerWithSize predictions, GNNPhase phase, bool sampling) { if (is_single_class_label()) { - return GetGlobalAccuracyCPUSingle(predictions, phase); + return GetGlobalAccuracyCPUSingle(predictions, phase, sampling); } else { - return GetGlobalAccuracyCPUMulti(predictions, phase); + return GetGlobalAccuracyCPUMulti(predictions, phase, sampling); } } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( - PointerWithSize predictions, GNNPhase phase) { + PointerWithSize predictions, GNNPhase phase, bool sampling) { // check owned nodes' accuracy assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); @@ -416,6 +422,12 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( galois::iterate(begin_owned(), end_owned()), [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { + if (sampling) { + if (!IsInSampledGraph(lid)) { + return; + } + } + total_checked_ += 1; // get prediction by getting max size_t predicted_label = galois::MaxIndex( @@ -441,7 +453,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( - PointerWithSize predictions, GNNPhase phase) { + PointerWithSize predictions, GNNPhase phase, bool sampling) { const GNNLabel* full_ground_truth = GetMultiClassLabel(0); assert(predictions.size() == (num_label_classes_ * size())); @@ -465,6 +477,12 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( galois::iterate(begin_owned(), end_owned()), [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { + if (sampling) { + if 
(!IsInSampledGraph(lid)) { + return; + } + } + size_t label_index = lid * num_label_classes_ + label_class; GNNLabel true_label = full_ground_truth[label_index]; diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index a7b373373c..983ab9af87 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -10,8 +10,8 @@ galois::SigmoidLayer::ForwardPhaseCPU( input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; - galois::GAccumulator total_loss; - total_loss.reset(); + node_count_.reset(); + float_accumulator_.reset(); galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), @@ -22,6 +22,8 @@ galois::SigmoidLayer::ForwardPhaseCPU( return; } + node_count_ += 1; + size_t node_offset = feature_length * local_node; // sigmoid the values for this node for (unsigned index = 0; index < feature_length; index++) { @@ -40,12 +42,13 @@ galois::SigmoidLayer::ForwardPhaseCPU( feature_length, graph_.GetMultiClassLabel(local_node), &forward_output_matrix_[node_offset]); // TODO(loc) normalize the loss - total_loss += input_loss_[local_node]; + float_accumulator_ += input_loss_[local_node]; } }, galois::steal(), galois::loopname("SigmoidForward")); - galois::gPrint("Total loss is ", total_loss.reduce(), "\n"); + galois::gPrint("Average loss is ", + float_accumulator_.reduce() / node_count_.reduce(), "\n"); return forward_output_matrix_; } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 62dcabe622..9b9fc0e3a6 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,6 +8,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; + // TODO(loc) once needed for accuracy debugging, print out loss galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 598148af42..18db419793 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -29,6 +29,8 @@ extern llvm::cl::opt multiclass_labels; //! Toggles an optimization that flips aggregate/update step if it would be //! beneficial extern llvm::cl::opt disable_agg_after_update; +//! 
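The loss-reporting change in the sigmoid layer follows the same idea: accumulate loss only over nodes that were actually evaluated and divide by that count rather than printing the raw total. A minimal sketch of that averaging, without the distributed accumulators (the mask vector is an assumption):

    #include <cstddef>
    #include <vector>

    // Average a per-node loss over only the nodes that were evaluated
    // (valid for the phase and, if sampling, inside the sampled graph).
    // Mirrors the float_accumulator_ / node_count_ division, minus the
    // cross-host reduction.
    float AverageLoss(const std::vector<float>& per_node_loss,
                      const std::vector<char>& evaluated) {
      double total = 0.0;
      size_t count = 0;
      for (size_t n = 0; n < per_node_loss.size(); ++n) {
        if (!evaluated[n])
          continue;
        total += per_node_loss[n];
        ++count;
      }
      return count ? static_cast<float>(total / count) : 0.0f;
    }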
Random sampling of nodes every epoch +extern llvm::cl::opt do_graph_sampling; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 684e4111dd..b4cd7fb67e 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -79,6 +79,12 @@ llvm::cl::opt disable_agg_after_update( "after update optimization"), cll::init(false)); +llvm::cl::opt + do_graph_sampling("doGraphSampling", + cll::desc("If true (off by default), sample nodes for " + "use every epoch at a 50\% drop rate"), + cll::init(false)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -188,9 +194,9 @@ InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { // layer config object galois::GNNLayerConfig layer_config = CreateLayerConfig(); // GNN config object - galois::GraphNeuralNetworkConfig gnn_config(num_layers, layer_types, - layer_sizes_vector, - output_layer_type, layer_config); + galois::GraphNeuralNetworkConfig gnn_config( + num_layers, layer_types, layer_sizes_vector, output_layer_type, + do_graph_sampling, layer_config); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From c2582e93fc746b6ae488d433947eb0998d2d4ac7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Feb 2021 17:10:15 -0600 Subject: [PATCH 461/660] Option for diff. sample rates; test sample fix Separated sampling function into 2 functions: one allows you to adjust rate of sampling. Fixed sampling such that anything not the training phase will use the entire graph as it should. --- libgnn/include/galois/graphs/GNNGraph.h | 4 +++- libgnn/src/graphs/GNNGraph.cpp | 10 ++++++---- libgnn/src/layers/GraphConvolutionalLayer.cpp | 5 +++-- libgnn/src/layers/SigmoidLayer.cpp | 6 ++++-- libgnn/src/layers/SoftmaxLayer.cpp | 4 ++-- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index a4ef90ea4a..3c0419c28f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -150,8 +150,10 @@ class GNNGraph { //! Loops through all master nodes and determines if it is "on" or "off" //! (the meaning of on and off depends on how it is used; for now, it is used - //! to indicate subgraph presence) + //! to indicate subgraph presence); droprate controls chance of being dropped + //! (e.g. if 0.8, a node is 80% likely to not be included in subgraph) void UniformNodeSample(); + void UniformNodeSample(float droprate); //! 
Makes a node "sampled"; used for debugging/testing void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4a1b3a2f99..ee243f5c4d 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -130,10 +130,12 @@ void galois::graphs::GNNGraph::AggregateSync( "GraphAggregateSync"); } -void galois::graphs::GNNGraph::UniformNodeSample() { +void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } + +void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { - partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(0.5); + partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(droprate); }); // TODO(loc) GPU // TODO(loc) sync the flags across all machines to have same sample on all of @@ -423,7 +425,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { if (sampling) { - if (!IsInSampledGraph(lid)) { + if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { return; } } @@ -478,7 +480,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { if (sampling) { - if (!IsInSampledGraph(lid)) { + if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { return; } } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 46b997b087..c416a0272a 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -211,7 +211,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( if (IsSampledLayer()) { // check if node is part of sampled graph; ignore after 0'ing if not // sampled - if (!graph_.IsInSampledGraph(src)) + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(src)) return; } @@ -225,7 +225,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t dst = graph_.EdgeDestination(e); if (IsSampledLayer()) { // ignore non-sampled nodes - if (!graph_.IsInSampledGraph(dst)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) continue; } diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 983ab9af87..35f95b64a6 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -18,7 +18,8 @@ galois::SigmoidLayer::ForwardPhaseCPU( [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(local_node)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(local_node)) return; } @@ -74,7 +75,8 @@ galois::SigmoidLayer::BackwardPhaseCPU() { [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(local_node)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(local_node)) return; } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 9b9fc0e3a6..d98251091c 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -14,7 +14,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(i)) + if (layer_phase_ == 
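For reference, the per-epoch node sampling above is just an independent coin flip per master node. A stand-alone sketch using the standard library RNG; per the header comment, `droprate` is the chance a node is excluded, and since the library's DoBernoulli() convention is not shown in this diff, std::bernoulli_distribution is used directly here as an assumption:

    #include <cstddef>
    #include <random>
    #include <vector>

    // Mark each node as sampled (1) or dropped (0) for this epoch. A droprate
    // of 0.5 keeps roughly half of the nodes.
    void SampleNodesUniformly(std::vector<char>* sampled_flags, float droprate,
                              std::mt19937* rng) {
      std::bernoulli_distribution keep(1.0 - droprate);
      for (char& flag : *sampled_flags)
        flag = keep(*rng) ? 1 : 0;
    }

In the training loop this runs once per epoch before the forward pass, after which the norm factors are recomputed for the new subgraph.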
GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) return; } @@ -70,7 +70,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(i)) + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) return; } From e393c03b80e2106ff1954e48cd50c0ffd8946ace Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 5 Feb 2021 12:16:56 -0600 Subject: [PATCH 462/660] ogbn-proteins split --- libcusp/include/galois/graphs/NewGeneric.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 0c3e4b31d4..771c5b5143 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -115,6 +115,10 @@ class NewDistGraphGeneric : public DistGraph { // this is entire graph: amazon's mask isn't contiguous bps.push_back(0); bps.push_back(1569960); + } else if (filename.find("ogbn-proteins") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(86618); } else { // TODO(loc) only die under certain conditions; don't die if something // is missing From 78f1e86a792091cff94d56e47d02b75fbcb1434d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 5 Feb 2021 12:40:28 -0600 Subject: [PATCH 463/660] Inductive gnn training Inductive gnn training option enabled by this commit. Users can specify inductive training, and the result is that during training validation and test nodes are completely ignored. --- libgnn/include/galois/layers/GNNLayer.h | 5 ++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 37 ++++++++++++++----- lonestar/libgnnbench/include/GNNBench/Input.h | 21 ----------- lonestar/libgnnbench/include/GNNBench/Start.h | 3 -- lonestar/libgnnbench/src/Input.cpp | 7 ++++ 5 files changed, 38 insertions(+), 35 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 06bce9660f..6ec6a78671 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -53,6 +53,8 @@ struct GNNLayerConfig { bool disable_aggregate_after_update{false}; //! Graph sampling flag in use or not bool do_sampling{false}; + //! Inductive layer means for aggregation all non-training nodes are ignored + bool inductive_training_{false}; // TODO activation type; for now default is softmax }; @@ -139,7 +141,8 @@ class GNNLayer { //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } - bool IsSampledLayer() { return config_.do_sampling; } + bool IsSampledLayer() const { return config_.do_sampling; } + bool IsInductiveLayer() const { return config_.inductive_training_; } #ifdef GALOIS_ENABLE_GPU //! 
Utility function for allocating diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index c416a0272a..208229d6f1 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -208,11 +208,19 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( aggregate_output[index_to_src_feature + i] = 0; } - if (IsSampledLayer()) { - // check if node is part of sampled graph; ignore after 0'ing if not - // sampled - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(src)) - return; + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // check if node is part of sampled graph; ignore after 0'ing if not + // sampled + if (!graph_.IsInSampledGraph(src)) + return; + } } GNNFloat source_norm = 0.0; @@ -223,11 +231,20 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { size_t dst = graph_.EdgeDestination(e); - if (IsSampledLayer()) { - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } } size_t index_to_dst_feature = dst * column_length; diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 18db419793..784b1fd431 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -10,27 +10,6 @@ extern llvm::cl::opt input_directory; extern llvm::cl::opt input_name; //! Scheme used to partition the graph extern llvm::cl::opt partition_scheme; -// Control layer count and size -extern llvm::cl::opt num_layers; -extern llvm::cl::list layer_sizes; -// Control dropout -extern llvm::cl::opt do_dropout; -extern llvm::cl::opt dropout_rate; -// Control activation -extern llvm::cl::opt do_activation; -// TODO activation layer type once more are supported -//! Controls weight normalization based on degree -extern llvm::cl::opt do_normalization; -//! Output layer type -extern llvm::cl::opt output_layer_type; -//! If true, use multiclass ground truth -extern llvm::cl::opt multiclass_labels; -// TODO optimizer type -//! Toggles an optimization that flips aggregate/update step if it would be -//! beneficial -extern llvm::cl::opt disable_agg_after_update; -//! 
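The inductive-training guards added to the convolutional layer compose with the sampling guards in a fixed order: outside the training phase everything participates; during training, inductive mode hides all non-train nodes, and sampling additionally hides nodes outside this epoch's subgraph. A small sketch of that predicate, with simplified names that are not the real layer API:

    enum class Phase { kTrain, kValidate, kTest };

    // Decide whether a node participates in aggregation for the current phase.
    // Validation/test always see the full graph; the filters only apply while
    // training.
    bool ParticipatesInAggregation(Phase layer_phase, bool inductive,
                                   bool sampling, bool is_train_node,
                                   bool is_sampled_node) {
      if (layer_phase != Phase::kTrain)
        return true;
      if (inductive && !is_train_node)
        return false;                    // non-train nodes do not exist
      if (sampling && !is_sampled_node)
        return false;                    // dropped for this epoch
      return true;
    }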
Random sampling of nodes every epoch -extern llvm::cl::opt do_graph_sampling; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index c17ddecadc..c03970c868 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -8,10 +8,7 @@ // CLI //////////////////////////////////////////////////////////////////////////////// -extern llvm::cl::opt num_threads; -extern llvm::cl::opt num_runs; extern llvm::cl::opt num_epochs; -extern llvm::cl::opt stat_file; //////////////////////////////////////////////////////////////////////////////// // Init functions diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index b4cd7fb67e..d9b92607b1 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -85,6 +85,12 @@ llvm::cl::opt "use every epoch at a 50\% drop rate"), cll::init(false)); +llvm::cl::opt + do_inductive_training("doInductiveTraining", + cll::desc("If true (off by default), during training " + "all non-train nodes are ignored"), + cll::init(false)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -138,6 +144,7 @@ galois::GNNLayerConfig CreateLayerConfig() { layer_config.do_activation = do_activation; layer_config.do_normalization = do_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; + layer_config.inductive_training_ = do_inductive_training; return layer_config; } From 5b95e08224d8337b7cff5d71f2dc60967c7795aa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 5 Feb 2021 21:09:31 -0600 Subject: [PATCH 464/660] Sample and inductive norm factor correction Adds inductive training (i.e. ignore non-train nodes during training). Sampling/inductive training changes up input graph: norm factor must chnage as well. --- libgnn/include/galois/GraphNeuralNetwork.h | 14 +++-- libgnn/include/galois/graphs/GNNGraph.h | 7 +++ libgnn/src/GraphNeuralNetwork.cpp | 7 ++- libgnn/src/graphs/GNNGraph.cpp | 72 +++++++++++++++++++++- lonestar/libgnnbench/src/Input.cpp | 1 + 5 files changed, 93 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index ed4cc19b8c..ae860d0d32 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -58,9 +58,9 @@ class GraphNeuralNetworkConfig { GNNOutputLayerType output_layer_type, bool do_sampling, const GNNLayerConfig& default_layer_config) - : num_intermediate_layers_(num_layers), layer_types_(layer_types), - layer_column_sizes_(layer_column_sizes), - output_layer_type_(output_layer_type), do_sampling_(do_sampling), + : do_sampling_(do_sampling), num_intermediate_layers_(num_layers), + layer_types_(layer_types), layer_column_sizes_(layer_column_sizes), + output_layer_type_(output_layer_type), default_layer_config_(default_layer_config) { // Do sanity checks on inputs // should have a type for each layer @@ -98,6 +98,12 @@ class GraphNeuralNetworkConfig { return default_layer_config_; } + // public because they are independent of other settings + //! Graph sampling + bool do_sampling_{false}; + //! Inductive = training ignores test/val set + bool inductive_training_{false}; + private: //! Number of layers to construct in the GNN not including the output //! 
layer @@ -110,8 +116,6 @@ class GraphNeuralNetworkConfig { std::vector layer_column_sizes_; //! Output layer type GNNOutputLayerType output_layer_type_; - //! Graph sampling - bool do_sampling_; //! Default config to use for layers GNNLayerConfig default_layer_config_; }; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3c0419c28f..2ed6647b7c 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -171,9 +171,16 @@ class GNNGraph { return partitioned_graph_->getData(node_id); } + //! Calculate norm factor considering the entire graph + void CalculateFullNormFactor(); + //! Calculate norm factor considering sampled nodes and/or training nodes + //! only (inductive) + void CalculateSpecialNormFactor(bool is_sampled, bool is_inductive); + #ifdef GALOIS_ENABLE_GPU const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } #endif + private: ////////////////////////////////////////////////////////////////////////////// // Initialization diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index d9c0110a9f..be188ff843 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -92,11 +92,16 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // } // } + if (config_.inductive_training_) { + graph_->CalculateSpecialNormFactor(false, true); + } + // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { if (config_.do_sampling()) { // subgraph sample every epoch graph_->UniformNodeSample(); + graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); GradientPropagation(); @@ -107,7 +112,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // TODO validation and test as necessary } - + graph_->CalculateFullNormFactor(); // check test accuracy galois::StatTimer acc_timer("FinalAccuracyTest"); acc_timer.start(); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index ee243f5c4d..3e5d468da2 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -130,7 +130,7 @@ void galois::graphs::GNNGraph::AggregateSync( "GraphAggregateSync"); } -void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } +void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.5); } void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( @@ -374,6 +374,11 @@ void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { void galois::graphs::GNNGraph::InitNormFactor() { GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); norm_factors_.resize(partitioned_graph_->size(), 0.0); + CalculateFullNormFactor(); +} + +void galois::graphs::GNNGraph::CalculateFullNormFactor() { + norm_factors_.assign(partitioned_graph_->size(), 0.0); // get the norm factor contribution for each node based on the GLOBAL graph galois::do_all( @@ -389,7 +394,70 @@ void galois::graphs::GNNGraph::InitNormFactor() { 1.0 / std::sqrt(static_cast(global_degree)); } }, - galois::loopname("InitNormFactor")); + galois::loopname("CalculateFullNormFactor")); +} + +void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, + bool is_inductive) { + if (galois::runtime::getSystemNetworkInterface().Num > 1) { + GALOIS_LOG_FATAL("cannot run special norm factor in dist setting yet"); + } + + 
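The body of CalculateSpecialNormFactor that follows boils down to recomputing 1/sqrt(degree) per node, where the degree only counts neighbors that pass the same sampled/inductive filter as the node itself. A condensed single-host sketch with plain CSR arrays and a combined `keep` mask standing in for the partitioned graph and phase checks (those stand-ins are assumptions):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Recompute per-node norm factors over the filtered graph. Nodes that do
    // not pass the filter, or end up with degree zero, keep a factor of 0.
    void SpecialNormFactor(const std::vector<size_t>& row_start,
                           const std::vector<size_t>& edge_dst,
                           const std::vector<char>& keep,
                           std::vector<float>* norm_factors) {
      size_t num_nodes = row_start.size() - 1;
      norm_factors->assign(num_nodes, 0.0f);
      for (size_t n = 0; n < num_nodes; ++n) {
        if (!keep[n])
          continue;
        size_t degree = 0;
        for (size_t e = row_start[n]; e < row_start[n + 1]; ++e)
          if (keep[edge_dst[e]])
            ++degree;
        if (degree != 0)
          (*norm_factors)[n] = 1.0f / std::sqrt(static_cast<float>(degree));
      }
    }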
norm_factors_.assign(partitioned_graph_->size(), 0.0); + + // get the norm factor contribution for each node based on the GLOBAL graph + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t local_id) { + // ignore node if not valid + if (is_sampled && is_inductive) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain) || + !IsInSampledGraph(local_id)) { + return; + } + } else if (is_sampled) { + if (!IsInSampledGraph(local_id)) { + return; + } + } else if (is_inductive) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain)) { + return; + } + } + + size_t degree = 0; + + // TODO(loc) make this work in a distributed setting; assuming + // whole graph is present on single host at the moment + for (EdgeIterator e = EdgeBegin(local_id); e != EdgeEnd(local_id); + e++) { + size_t dest = EdgeDestination(e); + if (is_sampled && is_inductive) { + if (!IsValidForPhase(dest, GNNPhase::kTrain) || + !IsInSampledGraph(dest)) { + continue; + } + } else if (is_sampled) { + if (!IsInSampledGraph(dest)) { + continue; + } + } else if (is_inductive) { + if (!IsValidForPhase(dest, GNNPhase::kTrain)) { + continue; + } + } else { + GALOIS_LOG_WARN( + "Why is special norm factor called if not sampled/inductive?"); + } + degree += 1; + } + + // only set if non-zero + if (degree != 0) { + norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(degree)); + } + }, + galois::loopname("CalculateSpecialNormFactor")); } float galois::graphs::GNNGraph::GetGlobalAccuracy( diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d9b92607b1..dea458a6b3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -204,6 +204,7 @@ InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); + gnn_config.inductive_training_ = do_inductive_training; // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From c704c5a001a6de5a8fe88c904173840a7b1745af Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Tue, 9 Feb 2021 17:45:41 -0600 Subject: [PATCH 465/660] Implement a distributed multi-gpu GCN. 
(#1) Multi-GPU GCN --- libgluon/include/galois/cuda/Context.h | 118 +++++++ libgluon/include/galois/cuda/HostDecls.h | 46 ++- .../include/galois/graphs/GluonSubstrate.h | 294 ++++++++++++------ .../include/galois/runtime/SyncStructures.h | 99 ++++++ .../include/galois/runtime/cuda/DeviceSync.h | 217 ++++++++----- libgnn/CMakeLists.txt | 5 +- libgnn/include/galois/CUDAUtilHostDecls.h | 3 + .../include/galois/GNNCudaContextHostDecls.h | 82 +++++ libgnn/include/galois/GNNOptimizers.h | 40 +-- libgnn/include/galois/GNNTypes.h | 5 + libgnn/include/galois/graphs/GNNGraph.h | 30 +- .../graphs/GraphAggregationSyncStructures.h | 26 +- libgnn/include/galois/layers/GNNLayer.h | 4 +- libgnn/src/CUDAUtil.cu | 9 + libgnn/src/GNNCudaContext.cu | 228 ++++++++++++++ libgnn/src/GNNOptimizers.cpp | 59 ++-- libgnn/src/GraphNeuralNetwork.cpp | 58 +++- libgnn/src/graphs/GNNGraph.cpp | 68 +++- libgnn/src/layers/GNNLayer.cpp | 112 ++++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 158 ++++++---- libgnn/src/layers/SoftmaxLayer.cpp | 30 +- libgnn/test/CMakeLists.txt | 3 + libgnn/test/gpu-adam-test.cpp | 2 +- libgnn/test/gpu-aggregate-sync-test.cpp | 212 +++++++++++++ libgnn/test/gpu-convlayer-test.cpp | 11 +- libgnn/test/gpu-epoch-test.cpp | 1 + libgnn/test/gpu-softmaxlayer-test.cpp | 1 + libgpu/include/sharedptr.h | 11 + lonestar/libgnnbench/CMakeLists.txt | 1 + lonestar/libgnnbench/include/GNNBench/Input.h | 4 + lonestar/libgnnbench/include/GNNBench/Start.h | 14 + lonestar/libgnnbench/src/Start.cpp | 73 ++++- 32 files changed, 1640 insertions(+), 384 deletions(-) create mode 100644 libgnn/include/galois/CUDAUtilHostDecls.h create mode 100644 libgnn/include/galois/GNNCudaContextHostDecls.h create mode 100644 libgnn/src/CUDAUtil.cu create mode 100644 libgnn/src/GNNCudaContext.cu create mode 100644 libgnn/test/gpu-aggregate-sync-test.cpp diff --git a/libgluon/include/galois/cuda/Context.h b/libgluon/include/galois/cuda/Context.h index 0ecf9eba82..57492bfdf6 100644 --- a/libgluon/include/galois/cuda/Context.h +++ b/libgluon/include/galois/cuda/Context.h @@ -32,6 +32,7 @@ #include #include "gg.h" #include "galois/cuda/HostDecls.h" +#include "galois/cuda/DynamicBitset.h" struct CUDA_Context_Shared { unsigned int* num_nodes; // per host @@ -170,6 +171,34 @@ size_t mem_usage_CUDA_common(MarshalGraph& g, unsigned num_hosts) { return mem_usage; } +size_t mem_usage_CUDA_common(PartitionedGraphInfo& g_info, unsigned num_hosts) { + size_t mem_usage = 0; + size_t max_shared_size = 0; // for union across master/mirror of all hosts + mem_usage += num_hosts * sizeof(unsigned int); + mem_usage += num_hosts * sizeof(Shared); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (g_info.num_master_nodes[h] > 0) { + mem_usage += g_info.num_master_nodes[h] * sizeof(unsigned int); + } + if (g_info.num_master_nodes[h] > max_shared_size) { + max_shared_size = g_info.num_master_nodes[h]; + } + } + mem_usage += num_hosts * sizeof(unsigned int); + mem_usage += num_hosts * sizeof(Shared); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (g_info.num_mirror_nodes[h] > 0) { + mem_usage += g_info.num_mirror_nodes[h] * sizeof(unsigned int); + } + if (g_info.num_mirror_nodes[h] > max_shared_size) { + max_shared_size = g_info.num_mirror_nodes[h]; + } + } + mem_usage += max_shared_size * sizeof(unsigned int); + mem_usage += ((max_shared_size + 63) / 64) * sizeof(unsigned long long int); + return mem_usage; +} + template void load_graph_CUDA_field(struct CUDA_Context_Common* ctx, struct CUDA_Context_Field* field, @@ -191,6 +220,44 @@ void 
load_graph_CUDA_field(struct CUDA_Context_Common* ctx, field->is_updated.cpu_wr_ptr()->alloc(ctx->gg.nnodes); } +//! Set up cuda context for vector communication. +//! A vector of the vector is represented as a flattened 1D vector. +//! Users can either allocate data on this function or not. +//! The data could be a pointer which had been allocated at outside. +template +void load_graph_CUDA_field_inflating(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned num_hosts, unsigned nnodes, + size_t infl_size) { + load_graph_CUDA_field_inflating(ctx, field, num_hosts, nnodes, + infl_size, true); +} + +template +void load_graph_CUDA_field_inflating(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned num_hosts, unsigned nnodes, + size_t infl_size, bool data_alloc) { + size_t max_shared_size = 0; // for union across master/mirror of all hosts + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->master.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->master.num_nodes[h]; + } + } + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->mirror.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->mirror.num_nodes[h]; + } + } + field->is_updated.alloc(1); + field->is_updated.cpu_wr_ptr()->alloc(nnodes); + + if (data_alloc) { + field->data.alloc(nnodes * infl_size); + } + field->shared_data.alloc(max_shared_size * infl_size); +} + template size_t mem_usage_CUDA_field(struct CUDA_Context_Field* field, MarshalGraph& g, unsigned num_hosts) { @@ -211,3 +278,54 @@ size_t mem_usage_CUDA_field(struct CUDA_Context_Field* field, mem_usage += ((g.nnodes + 63) / 64) * sizeof(unsigned long long int); return mem_usage; } + +void load_graph_CUDA_common(struct CUDA_Context_Common* ctx, + PartitionedGraphInfo& g_info, unsigned num_hosts) { + ctx->numOwned = g_info.numOwned; + ctx->beginMaster = g_info.beginMaster; + ctx->numNodesWithEdges = g_info.numNodesWithEdges; + assert(ctx->id == g_info.id); + + size_t mem_usage = + ((g_info.nnodes + 1) + g_info.nedges) * sizeof(index_type) + + (g_info.nnodes) * sizeof(node_data_type); + + size_t max_shared_size = 0; // for union across master/mirror of all hosts + ctx->master.num_nodes = + (unsigned int*)calloc(num_hosts, sizeof(unsigned int)); + memcpy(ctx->master.num_nodes, g_info.num_master_nodes, + sizeof(unsigned int) * num_hosts); + ctx->master.nodes = (DeviceOnly*)calloc( + num_hosts, sizeof(Shared)); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->master.num_nodes[h] > 0) { + ctx->master.nodes[h].alloc(ctx->master.num_nodes[h]); + ctx->master.nodes[h].copy_to_gpu(g_info.master_nodes[h], + ctx->master.num_nodes[h]); + } + if (ctx->master.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->master.num_nodes[h]; + } + } + ctx->mirror.num_nodes = + (unsigned int*)calloc(num_hosts, sizeof(unsigned int)); + memcpy(ctx->mirror.num_nodes, g_info.num_mirror_nodes, + sizeof(unsigned int) * num_hosts); + ctx->mirror.nodes = (DeviceOnly*)calloc( + num_hosts, sizeof(Shared)); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->mirror.num_nodes[h] > 0) { + ctx->mirror.nodes[h].alloc(ctx->mirror.num_nodes[h]); + ctx->mirror.nodes[h].copy_to_gpu(g_info.mirror_nodes[h], + ctx->mirror.num_nodes[h]); + } + if (ctx->mirror.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->mirror.num_nodes[h]; + } + } + ctx->offsets.alloc(max_shared_size); + ctx->is_updated.alloc(1); + ctx->is_updated.cpu_wr_ptr()->alloc(max_shared_size); + // printf("[%u] load_graph_GPU: %u owned nodes of total %u resident, 
%lu + // edges\n", ctx->id, ctx->nowned, graph.nnodes, graph.nedges); +} diff --git a/libgluon/include/galois/cuda/HostDecls.h b/libgluon/include/galois/cuda/HostDecls.h index a085b26967..d4852df70d 100644 --- a/libgluon/include/galois/cuda/HostDecls.h +++ b/libgluon/include/galois/cuda/HostDecls.h @@ -35,6 +35,45 @@ typedef unsigned int node_data_type; typedef unsigned int edge_data_type; #endif +struct PartitionedGraphInfo { + size_t nnodes; + size_t nedges; + unsigned int numOwned; // Number of nodes owned (masters) by this host + unsigned int beginMaster; // local id of the beginning of master nodes + unsigned int numNodesWithEdges; // Number of nodes (masters + mirrors) that + // have outgoing edges + int id; + unsigned numHosts; + unsigned int* num_master_nodes; + unsigned int** master_nodes; + unsigned int* num_mirror_nodes; + unsigned int** mirror_nodes; + + PartitionedGraphInfo() + : nnodes(0), nedges(0), numOwned(0), beginMaster(0), numNodesWithEdges(0), + id(-1), numHosts(0), num_master_nodes(nullptr), master_nodes(nullptr), + num_mirror_nodes(nullptr), mirror_nodes(nullptr) {} + + ~PartitionedGraphInfo() { + if (!num_master_nodes) + free(num_master_nodes); + if (!master_nodes) { + for (unsigned i = 0; i < numHosts; ++i) { + free(master_nodes[i]); + } + free(master_nodes); + } + if (!num_mirror_nodes) + free(num_mirror_nodes); + if (!mirror_nodes) { + for (unsigned i = 0; i < numHosts; ++i) { + free(mirror_nodes[i]); + } + free(mirror_nodes); + } + } +}; + struct MarshalGraph { size_t nnodes; size_t nedges; @@ -55,9 +94,10 @@ struct MarshalGraph { MarshalGraph() : nnodes(0), nedges(0), numOwned(0), beginMaster(0), numNodesWithEdges(0), - id(-1), numHosts(0), row_start(NULL), edge_dst(NULL), node_data(NULL), - edge_data(NULL), num_master_nodes(NULL), master_nodes(NULL), - num_mirror_nodes(NULL), mirror_nodes(NULL) {} + id(-1), numHosts(0), row_start(nullptr), edge_dst(nullptr), + node_data(nullptr), edge_data(nullptr), num_master_nodes(nullptr), + master_nodes(nullptr), num_mirror_nodes(nullptr), + mirror_nodes(nullptr) {} ~MarshalGraph() { if (!row_start) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index f79427af89..8b68216794 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -719,21 +719,22 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName Name to give timer * @param x Host to send to * @param b OUTPUT: Buffer that will hold data to send + * @param elem_size The inner-vector dimesnion of a vector of the vector */ template < SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, bool async, typename std::enable_if::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, - galois::runtime::SendBuffer& b) { + galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; if (BitsetFnTy::is_valid()) { syncExtract( - loopName, x, sharedNodes[x], b); + loopName, x, sharedNodes[x], b, elem_size); } else { syncExtract(loopName, x, sharedNodes[x], - b); + b, elem_size); } std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; @@ -747,11 +748,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, - galois::runtime::SendBuffer& b) { + galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; syncExtract( - loopName, x, sharedNodes[x], b); + loopName, x, sharedNodes[x], b, elem_size); std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string statSendBytes_str(syncTypeStr + "SendBytesVector_" + @@ -1644,9 +1645,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, - galois::runtime::SendBuffer& b) { - uint32_t num = indices.size(); + std::vector& indices, galois::runtime::SendBuffer& b, + size_t elem_size) { + uint32_t num = indices.size() * elem_size; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -1725,8 +1726,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, - galois::runtime::SendBuffer& b) { + std::vector& indices, galois::runtime::SendBuffer& b, + size_t elem_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string extract_timer_str(syncTypeStr + "Extract_" + get_run_identifier(loopName)); @@ -1739,7 +1740,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { DataCommMode data_mode; - uint32_t num = indices.size(); + uint32_t num = indices.size() * elem_size; static VecTy val_vec; // sometimes wasteful static galois::PODResizeableArray dummyVector; @@ -1768,7 +1769,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { b.resize(sizeof(DataCommMode) + sizeof(size_t) + (num * sizeof(typename SyncFnTy::ValTy))); } - } else { b.resize(0); if (!async) { @@ -1808,9 +1808,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, - galois::runtime::SendBuffer& b) { - uint32_t num = indices.size(); + std::vector& indices, galois::runtime::SendBuffer& b, + size_t elem_size) { + uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; @@ -1947,8 +1947,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void syncExtract(std::string loopName, unsigned, std::vector& indices, - galois::runtime::SendBuffer& b) { - uint32_t num = indices.size(); + galois::runtime::SendBuffer& b, size_t elem_size) { + uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; @@ -1958,7 +1958,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { get_run_identifier(loopName)); galois::CondStatTimer Textract(extract_timer_str.c_str(), RNAME); - Textract.start(); if (num > 0) { @@ -2123,7 +2122,7 @@ class GluonSubstrate : public 
galois::runtime::GlobalObject { template - void syncNetSend(std::string loopName) { + void syncNetSend(std::string loopName, size_t elem_size) { static galois::runtime::SendBuffer b; // although a static variable, allocation not reused // due to std::move in net.sendTagged() @@ -2141,7 +2140,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { continue; getSendBuffer(loopName, x, - b); + b, elem_size); if ((!async) || (b.size() > 0)) { size_t syncTypePhase = 0; @@ -2178,14 +2177,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template - void syncSend(std::string loopName) { + void syncSend(std::string loopName, size_t elem_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; galois::CondStatTimer TSendTime( (syncTypeStr + "Send_" + get_run_identifier(loopName)).c_str(), RNAME); TSendTime.start(); syncNetSend(loopName); + VecTy, async>(loopName, elem_size); TSendTime.stop(); } @@ -2717,7 +2716,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template - inline void reduce(std::string loopName) { + inline void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), RNAME); @@ -2735,7 +2734,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { case noBareMPI: #endif syncSend(loopName); + VecTy, async>(loopName, elem_size); syncRecv(loopName); #ifdef GALOIS_USE_BARE_MPI @@ -2768,7 +2767,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template - inline void broadcast(std::string loopName) { + inline void broadcast(std::string loopName, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), RNAME); @@ -2810,10 +2809,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { #endif if (use_bitset) { syncSend(loopName); + BitsetFnTy, VecTy, async>(loopName, elem_size); } else { syncSend(loopName); + galois::InvalidBitsetFnTy, VecTy, async>(loopName, elem_size); } syncRecv(loopName); @@ -2845,12 +2844,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_src_to_src(std::string loopName) { + inline void sync_src_to_src(std::string loopName, size_t elem_size) { // do nothing for OEC // reduce and broadcast for IEC, CVC, UVC if (transposed || isVertexCut) { - reduce(loopName); - broadcast(loopName); + reduce(loopName, + elem_size); + broadcast( + loopName, elem_size); } } @@ -2863,24 +2864,24 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_src_to_dst(std::string loopName) { + inline void sync_src_to_dst(std::string loopName, size_t elem_size) { // only broadcast for OEC // only reduce for IEC // reduce and broadcast for CVC, UVC if (transposed) { reduce( - loopName); + loopName, elem_size); if (isVertexCut) { broadcast( - loopName); + loopName, elem_size); } } else { if (isVertexCut) { reduce( - loopName); + loopName, elem_size); } broadcast( - loopName); + loopName, elem_size); } } @@ -2893,13 +2894,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_src_to_any(std::string loopName) { + inline void sync_src_to_any(std::string loopName, size_t elem_size) { // only broadcast for OEC // 
reduce and broadcast for IEC, CVC, UVC if (transposed || isVertexCut) { - reduce(loopName); + reduce(loopName, + elem_size); } - broadcast(loopName); + broadcast(loopName, + elem_size); } /** @@ -2911,23 +2914,23 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_dst_to_src(std::string loopName) { + inline void sync_dst_to_src(std::string loopName, size_t elem_size) { // only reduce for OEC // only broadcast for IEC // reduce and broadcast for CVC, UVC if (transposed) { if (isVertexCut) { reduce( - loopName); + loopName, elem_size); } broadcast( - loopName); + loopName, elem_size); } else { reduce( - loopName); + loopName, elem_size); if (isVertexCut) { broadcast( - loopName); + loopName, elem_size); } } } @@ -2941,14 +2944,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_dst_to_dst(std::string loopName) { + inline void sync_dst_to_dst(std::string loopName, size_t elem_size) { // do nothing for IEC // reduce and broadcast for OEC, CVC, UVC if (!transposed || isVertexCut) { reduce( - loopName); + loopName, elem_size); broadcast( - loopName); + loopName, elem_size); } } @@ -2961,13 +2964,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_dst_to_any(std::string loopName) { + inline void sync_dst_to_any(std::string loopName, size_t elem_size) { // only broadcast for IEC // reduce and broadcast for OEC, CVC, UVC if (!transposed || isVertexCut) { - reduce(loopName); + reduce(loopName, + elem_size); } - broadcast(loopName); + broadcast( + loopName, elem_size); } /** @@ -2979,12 +2984,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_any_to_src(std::string loopName) { + inline void sync_any_to_src(std::string loopName, size_t elem_size) { // only reduce for OEC // reduce and broadcast for IEC, CVC, UVC - reduce(loopName); + reduce(loopName, + elem_size); if (transposed || isVertexCut) { - broadcast(loopName); + broadcast(loopName, + elem_size); } } @@ -2997,14 +3004,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_any_to_dst(std::string loopName) { + inline void sync_any_to_dst(std::string loopName, size_t elem_size) { // only reduce for IEC // reduce and broadcast for OEC, CVC, UVC - reduce(loopName); + reduce(loopName, + elem_size); if (!transposed || isVertexCut) { broadcast( - loopName); + loopName, elem_size); } } @@ -3017,10 +3025,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_any_to_any(std::string loopName) { + inline void sync_any_to_any(std::string loopName, size_t elem_size) { // reduce and broadcast for OEC, IEC, CVC, UVC - reduce(loopName); - broadcast(loopName); + reduce(loopName, elem_size); + broadcast(loopName, + elem_size); } //////////////////////////////////////////////////////////////////////////////// @@ -3028,6 +3037,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { //////////////////////////////////////////////////////////////////////////////// public: + template + inline void sync(std::string loopName) { + sync(loopName, 1); + } + /** * Main sync call 
exposed to the user that calls the correct sync function * based on provided template arguments. Must provide information through @@ -3043,38 +3059,38 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template - inline void sync(std::string loopName) { + inline void sync(std::string loopName, size_t elem_size) { std::string timer_str("Sync_" + loopName + "_" + get_run_identifier()); galois::StatTimer Tsync(timer_str.c_str(), RNAME); Tsync.start(); if (partitionAgnostic) { - sync_any_to_any(loopName); + sync_any_to_any(loopName, elem_size); } else { if (writeLocation == writeSource) { if (readLocation == readSource) { - sync_src_to_src(loopName); + sync_src_to_src(loopName, elem_size); } else if (readLocation == readDestination) { - sync_src_to_dst(loopName); + sync_src_to_dst(loopName, elem_size); } else { // readAny - sync_src_to_any(loopName); + sync_src_to_any(loopName, elem_size); } } else if (writeLocation == writeDestination) { if (readLocation == readSource) { - sync_dst_to_src(loopName); + sync_dst_to_src(loopName, elem_size); } else if (readLocation == readDestination) { - sync_dst_to_dst(loopName); + sync_dst_to_dst(loopName, elem_size); } else { // readAny - sync_dst_to_any(loopName); + sync_dst_to_any(loopName, elem_size); } } else { // writeAny if (readLocation == readSource) { - sync_any_to_src(loopName); + sync_any_to_src(loopName, elem_size); } else if (readLocation == readDestination) { - sync_any_to_dst(loopName); + sync_any_to_dst(loopName, elem_size); } else { // readAny - sync_any_to_any(loopName); + sync_any_to_any(loopName, elem_size); } } } @@ -3153,13 +3169,20 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ static inline void call(GluonSubstrate* substrate, galois::runtime::FieldFlags& fieldFlags, - std::string loopName, const BITVECTOR_STATUS&) { + std::string loopName, const BITVECTOR_STATUS& b) { + call(substrate, fieldFlags, loopName, b, 1); + } + + static inline void call(GluonSubstrate* substrate, + galois::runtime::FieldFlags& fieldFlags, + std::string loopName, const BITVECTOR_STATUS&, + size_t elem_size) { if (fieldFlags.src_to_dst() && fieldFlags.dst_to_dst()) { - substrate->sync_any_to_dst(loopName); + substrate->sync_any_to_dst(loopName, elem_size); } else if (fieldFlags.src_to_dst()) { - substrate->sync_src_to_dst(loopName); + substrate->sync_src_to_dst(loopName, elem_size); } else if (fieldFlags.dst_to_dst()) { - substrate->sync_dst_to_dst(loopName); + substrate->sync_dst_to_dst(loopName, elem_size); } fieldFlags.clear_read_dst(); @@ -3189,6 +3212,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::FieldFlags& fieldFlags, std::string loopName, const BITVECTOR_STATUS& bvFlag) { + call(substrate, fieldFlags, loopName, bvFlag, 1); + } + + static inline void call(GluonSubstrate* substrate, + galois::runtime::FieldFlags& fieldFlags, + std::string loopName, + const BITVECTOR_STATUS& bvFlag, size_t elem_size) { bool src_write = fieldFlags.src_to_src() || fieldFlags.src_to_dst(); bool dst_write = fieldFlags.dst_to_src() || fieldFlags.dst_to_dst(); @@ -3201,42 +3231,56 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (src_write) { if (fieldFlags.src_to_src() && fieldFlags.src_to_dst()) { if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) { - substrate->sync_src_to_any(loopName); + substrate->sync_src_to_any(loopName, + elem_size); } else if (galois::runtime::src_invalid(bvFlag)) { // src invalid bitset; sync individually so it can be called // without bitset - 
substrate->sync_src_to_dst(loopName); - substrate->sync_src_to_src(loopName); + substrate->sync_src_to_dst(loopName, + elem_size); + substrate->sync_src_to_src(loopName, + elem_size); } else if (galois::runtime::dst_invalid(bvFlag)) { // dst invalid bitset; sync individually so it can be called // without bitset - substrate->sync_src_to_src(loopName); - substrate->sync_src_to_dst(loopName); + substrate->sync_src_to_src(loopName, + elem_size); + substrate->sync_src_to_dst(loopName, + elem_size); } else { GALOIS_DIE("invalid bitvector flag setting in syncOnDemand"); } } else if (fieldFlags.src_to_src()) { - substrate->sync_src_to_src(loopName); + substrate->sync_src_to_src(loopName, + elem_size); } else { // src to dst is set - substrate->sync_src_to_dst(loopName); + substrate->sync_src_to_dst(loopName, + elem_size); } } else if (dst_write) { if (fieldFlags.dst_to_src() && fieldFlags.dst_to_dst()) { if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) { - substrate->sync_dst_to_any(loopName); + substrate->sync_dst_to_any(loopName, + elem_size); } else if (galois::runtime::src_invalid(bvFlag)) { - substrate->sync_dst_to_dst(loopName); - substrate->sync_dst_to_src(loopName); + substrate->sync_dst_to_dst(loopName, + elem_size); + substrate->sync_dst_to_src(loopName, + elem_size); } else if (galois::runtime::dst_invalid(bvFlag)) { - substrate->sync_dst_to_src(loopName); - substrate->sync_dst_to_dst(loopName); + substrate->sync_dst_to_src(loopName, + elem_size); + substrate->sync_dst_to_dst(loopName, + elem_size); } else { GALOIS_DIE("invalid bitvector flag setting in syncOnDemand"); } } else if (fieldFlags.dst_to_src()) { - substrate->sync_dst_to_src(loopName); + substrate->sync_dst_to_src(loopName, + elem_size); } else { // dst to dst is set - substrate->sync_dst_to_dst(loopName); + substrate->sync_dst_to_dst(loopName, + elem_size); } } @@ -3252,20 +3296,25 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (src_read && dst_read) { if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) { - substrate->sync_any_to_any(loopName); + substrate->sync_any_to_any(loopName, + elem_size); } else if (galois::runtime::src_invalid(bvFlag)) { - substrate->sync_any_to_dst(loopName); - substrate->sync_any_to_src(loopName); + substrate->sync_any_to_dst(loopName, + elem_size); + substrate->sync_any_to_src(loopName, + elem_size); } else if (galois::runtime::dst_invalid(bvFlag)) { - substrate->sync_any_to_src(loopName); - substrate->sync_any_to_dst(loopName); + substrate->sync_any_to_src(loopName, + elem_size); + substrate->sync_any_to_dst(loopName, + elem_size); } else { GALOIS_DIE("invalid bitvector flag setting in syncOnDemand"); } } else if (src_read) { - substrate->sync_any_to_src(loopName); + substrate->sync_any_to_src(loopName, elem_size); } else { // dst_read - substrate->sync_any_to_dst(loopName); + substrate->sync_any_to_dst(loopName, elem_size); } } @@ -3395,6 +3444,63 @@ class GluonSubstrate : public galois::runtime::GlobalObject { userGraph.deallocate(); } } + + void getPartitionedGraphInfo(PartitionedGraphInfo& g_info) { + getPartitionedGraphInfo(g_info, true); + } + + void getPartitionedGraphInfo(PartitionedGraphInfo& g_info, + bool deallocate_graph) { + g_info.numOwned = userGraph.numMasters(); + // Assumption: master occurs at beginning in contiguous range + g_info.beginMaster = 0; + g_info.numNodesWithEdges = userGraph.getNumNodesWithEdges(); + g_info.id = id; + g_info.numHosts = numHosts; + + // copy memoization meta-data + g_info.num_master_nodes = + (unsigned int*)calloc(masterNodes.size(), 
sizeof(unsigned int)); + g_info.master_nodes = + (unsigned int**)calloc(masterNodes.size(), sizeof(unsigned int*)); + + for (uint32_t h = 0; h < masterNodes.size(); ++h) { + g_info.num_master_nodes[h] = masterNodes[h].size(); + + if (masterNodes[h].size() > 0) { + g_info.master_nodes[h] = + (unsigned int*)calloc(masterNodes[h].size(), sizeof(unsigned int)); + ; + std::copy(masterNodes[h].begin(), masterNodes[h].end(), + g_info.master_nodes[h]); + } else { + g_info.master_nodes[h] = NULL; + } + } + + g_info.num_mirror_nodes = + (unsigned int*)calloc(mirrorNodes.size(), sizeof(unsigned int)); + g_info.mirror_nodes = + (unsigned int**)calloc(mirrorNodes.size(), sizeof(unsigned int*)); + for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { + g_info.num_mirror_nodes[h] = mirrorNodes[h].size(); + + if (mirrorNodes[h].size() > 0) { + g_info.mirror_nodes[h] = + (unsigned int*)calloc(mirrorNodes[h].size(), sizeof(unsigned int)); + std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(), + g_info.mirror_nodes[h]); + } else { + g_info.mirror_nodes[h] = NULL; + } + } + + // user needs to provide method of freeing up graph (it can do nothing + // if they wish) + if (deallocate_graph) { + userGraph.deallocate(); + } + } #endif // het galois def //////////////////////////////////////////////////////////////////////////////// diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index b5a2b65d5c..44264461cd 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1981,4 +1981,103 @@ class FieldFlags { } \ } +#ifdef GALOIS_ENABLE_GPU +#define GALOIS_SYNC_STRUCTURE_GNN_LAYER(fieldname, cuda_ctx_for_sync, \ + gnn_matrix_to_sync_column_length_, \ + layer_number_to_sync) \ + struct GNNSumAggregate_##fieldname { \ + using ValTy = GNNFloat; \ + \ + static ValTy extract(uint32_t, char&) { return 0.f; } \ + \ + static bool reduce(uint32_t, char&, ValTy) { return false; } \ + \ + static void reset(uint32_t, char&) {} \ + \ + static void setVal(uint32_t, char&, ValTy) {} \ + \ + static bool extract_batch(unsigned from_id, uint8_t* buf, \ + size_t* buf_size, DataCommMode* mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, buf_size, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool extract_batch(unsigned from_id, uint8_t* buf) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool reduce_batch(unsigned from_id, uint8_t* buf, \ + DataCommMode mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_aggregate_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool reduce_mirror_batch(unsigned from_id, uint8_t* buf, \ + DataCommMode mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_aggregate_mirror_node_##fieldname##_matrix_cuda( \ + 
cuda_ctx_for_sync, from_id, buf, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool setVal_batch(unsigned from_id, uint8_t* buf, \ + DataCommMode mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_set_mirror_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool extract_reset_batch(unsigned from_id, uint8_t* buf, \ + size_t* buf_size, DataCommMode* mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_reset_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, buf_size, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool extract_reset_batch(unsigned from_id, uint8_t* buf) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_reset_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + }; +#endif + #endif // header guard diff --git a/libgluon/include/galois/runtime/cuda/DeviceSync.h b/libgluon/include/galois/runtime/cuda/DeviceSync.h index db23350c4a..a9512b1cc1 100644 --- a/libgluon/include/galois/runtime/cuda/DeviceSync.h +++ b/libgluon/include/galois/runtime/cuda/DeviceSync.h @@ -52,7 +52,8 @@ void kernel_sizing(dim3& blocks, dim3& threads) { } template -__global__ void batch_get_subset(index_type subset_size, +__global__ void batch_get_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, DataType* __restrict__ subset, const DataType* __restrict__ array) { @@ -61,12 +62,15 @@ __global__ void batch_get_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - subset[src] = array[index]; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + } } } template -__global__ void batch_get_subset(index_type subset_size, +__global__ void batch_get_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, DataType* __restrict__ subset, @@ -76,45 +80,52 @@ __global__ void batch_get_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - subset[src] = array[index]; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + } } } template -__global__ void batch_get_reset_subset(index_type subset_size, - const unsigned int* __restrict__ indices, - DataType* __restrict__ subset, - DataType* __restrict__ array, - DataType reset_value) { +__global__ void +batch_get_reset_subset(const index_type subset_size, const index_type elem_size, + const unsigned int* __restrict__ indices, + DataType* __restrict__ subset, + DataType* __restrict__ array, DataType reset_value) { 
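  // Each node entry now spans elem_size consecutive values: entry eid of node
  // `index` lives at array[index * elem_size + eid] (e.g. with elem_size == 4,
  // node 7 occupies array[28..31]). The loop below copies that whole row into
  // the outgoing subset buffer and then overwrites it with reset_value.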
unsigned tid = TID_1D; unsigned nthreads = TOTAL_THREADS_1D; index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - subset[src] = array[index]; - array[index] = reset_value; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + array[index * elem_size + eid] = reset_value; + } } } template -__global__ void batch_get_reset_subset(index_type subset_size, - const unsigned int* __restrict__ indices, - const OffsetIteratorType offsets, - DataType* __restrict__ subset, - DataType* __restrict__ array, - DataType reset_value) { +__global__ void +batch_get_reset_subset(const index_type subset_size, const index_type elem_size, + const unsigned int* __restrict__ indices, + const OffsetIteratorType offsets, + DataType* __restrict__ subset, + DataType* __restrict__ array, DataType reset_value) { unsigned tid = TID_1D; unsigned nthreads = TOTAL_THREADS_1D; index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - subset[src] = array[index]; - array[index] = reset_value; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + array[index * elem_size + eid] = reset_value; + } } } template -__global__ void batch_set_subset(index_type subset_size, +__global__ void batch_set_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const DataType* __restrict__ subset, DataType* __restrict__ array, @@ -124,7 +135,10 @@ __global__ void batch_set_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - array[index] = subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -132,7 +146,8 @@ __global__ void batch_set_subset(index_type subset_size, } template -__global__ void batch_set_subset(index_type subset_size, +__global__ void batch_set_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, const DataType* __restrict__ subset, @@ -143,7 +158,10 @@ __global__ void batch_set_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - array[index] = subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -151,7 +169,8 @@ __global__ void batch_set_subset(index_type subset_size, } template -__global__ void batch_add_subset(index_type subset_size, +__global__ void batch_add_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const DataType* __restrict__ subset, DataType* __restrict__ array, @@ -161,7 +180,10 @@ __global__ void batch_add_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - array[index] += subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] += subset[src * 
elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -169,7 +191,8 @@ __global__ void batch_add_subset(index_type subset_size, } template -__global__ void batch_add_subset(index_type subset_size, +__global__ void batch_add_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, const DataType* __restrict__ subset, @@ -180,7 +203,10 @@ __global__ void batch_add_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - array[index] += subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] += subset[src * elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -188,7 +214,8 @@ __global__ void batch_add_subset(index_type subset_size, } template -__global__ void batch_min_subset(index_type subset_size, +__global__ void batch_min_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const DataType* __restrict__ subset, DataType* __restrict__ array, @@ -198,17 +225,20 @@ __global__ void batch_min_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - if (array[index] > subset[src]) { - array[index] = subset[src]; - if (sharedType != sharedMirror) { - is_array_updated->set(index); + for (index_type eid = 0; eid < elem_size; eid++) { + if (array[index * elem_size + eid] > subset[src * elem_size + eid]) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + if (sharedType != sharedMirror) { + is_array_updated->set(index); + } } } } } template -__global__ void batch_min_subset(index_type subset_size, +__global__ void batch_min_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, const DataType* __restrict__ subset, @@ -219,10 +249,12 @@ __global__ void batch_min_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - if (array[index] > subset[src]) { - array[index] = subset[src]; - if (sharedType != sharedMirror) { - is_array_updated->set(index); + for (index_type eid = 0; eid < elem_size; eid++) { + if (array[index * elem_size + eid] > subset[src * elem_size + eid]) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + if (sharedType != sharedMirror) { + is_array_updated->set(index); + } } } } @@ -437,6 +469,15 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, struct CUDA_Context_Field* field, unsigned from_id, uint8_t* send_buffer, DataType i = 0) { + batch_get_shared_field(ctx, field, from_id, + send_buffer, 1, i); +} + +template +void batch_get_shared_field(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned from_id, uint8_t* send_buffer, + size_t elem_size, DataType i = 0) { struct CUDA_Context_Shared* shared; if (sharedType == sharedMaster) { shared = &ctx->master; @@ -454,12 +495,12 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, size_t v_size = shared->num_nodes[from_id]; if (reset) { batch_get_reset_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), i); + v_size, 
elem_size, shared->nodes[from_id].device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), i); } else { batch_get_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), - field->data.gpu_rd_ptr()); + v_size, elem_size, shared->nodes[from_id].device_ptr(), + shared_data->device_ptr(), field->data.gpu_rd_ptr()); } check_cuda_kernel; // timer1.stop(); @@ -468,7 +509,9 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, memcpy(send_buffer, &data_mode, sizeof(data_mode)); memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size)); shared_data->copy_to_cpu( - (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size); + (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), + v_size * elem_size); + // timer2.stop(); // timer.stop(); // fprintf(stderr, "Get %u->%u: Time (ms): %llu + %llu = %llu\n", @@ -480,7 +523,8 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, template void serializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, size_t bit_set_count, size_t num_shared, - DeviceOnly* shared_data, uint8_t* send_buffer) { + DeviceOnly* shared_data, uint8_t* send_buffer, + size_t elem_size) { if (data_mode == noData) { // do nothing return; @@ -520,7 +564,8 @@ void serializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, // serialize data vector memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count)); offset += sizeof(bit_set_count); - shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count); + shared_data->copy_to_cpu((DataType*)(send_buffer + offset), + (elem_size * bit_set_count)); // offset += bit_set_count * sizeof(DataType); } @@ -530,6 +575,16 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, unsigned from_id, uint8_t* send_buffer, size_t* v_size, DataCommMode* data_mode, DataType i = 0) { + batch_get_shared_field( + ctx, field, from_id, send_buffer, v_size, data_mode, 1, i); +} + +template +void batch_get_shared_field(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned from_id, uint8_t* send_buffer, + size_t* v_size, DataCommMode* data_mode, + size_t elem_size, DataType i = 0) { struct CUDA_Context_Shared* shared; if (sharedType == sharedMaster) { shared = &ctx->master; @@ -541,69 +596,53 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, dim3 threads; kernel_sizing(blocks, threads); - // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2"), - // timer3("timer3"), timer4("timer 4"); timer.start(); if (enforcedDataMode != onlyData) { - // timer1.start(); ctx->is_updated.cpu_rd_ptr()->resize(shared->num_nodes[from_id]); ctx->is_updated.cpu_rd_ptr()->reset(); + //! 
check updated entries and update bitset batch_get_subset_bitset<<>>( shared->num_nodes[from_id], shared->nodes[from_id].device_ptr(), ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr()); check_cuda_kernel; - // timer1.stop(); - // timer2.start(); get_offsets_from_bitset(shared->num_nodes[from_id], ctx->offsets.device_ptr(), ctx->is_updated.gpu_rd_ptr(), v_size); - // timer2.stop(); } *data_mode = get_data_mode(*v_size, shared->num_nodes[from_id]); - // timer3.start(); if ((*data_mode) == onlyData) { *v_size = shared->num_nodes[from_id]; if (reset) { batch_get_reset_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), i); } else { batch_get_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_rd_ptr()); } } else { // bitsetData || offsetsData if (reset) { batch_get_reset_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), i); } else { batch_get_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_rd_ptr()); } } check_cuda_kernel; - // timer3.stop(); - // timer4.start(); serializeMessage(ctx, *data_mode, *v_size, shared->num_nodes[from_id], - shared_data, send_buffer); - // timer4.stop(); - // timer.stop(); - // fprintf(stderr, "Get %u->%u: %d mode %u bitset %u indices. Time (ms): %llu - // + %llu + %llu + %llu = %llu\n", - // ctx->id, from_id, *data_mode, - // ctx->is_updated.cpu_rd_ptr()->alloc_size(), sizeof(unsigned int) * - // (*v_size), timer1.duration_ms(), timer2.duration_ms(), - // timer3.duration_ms(), timer4.duration_ms(), timer.duration_ms()); + shared_data, send_buffer, elem_size); } template void deserializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, size_t& bit_set_count, size_t num_shared, - DeviceOnly* shared_data, - uint8_t* recv_buffer) { + DeviceOnly* shared_data, uint8_t* recv_buffer, + size_t elem_size) { size_t offset = 0; // data_mode is already deserialized if (data_mode != onlyData) { @@ -640,8 +679,8 @@ void deserializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, // deserialize data vector offset += sizeof(bit_set_count); - shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), bit_set_count); - // offset += bit_set_count * sizeof(DataType); + shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), + bit_set_count * elem_size); } template @@ -649,6 +688,15 @@ void batch_set_shared_field(struct CUDA_Context_Common* ctx, struct CUDA_Context_Field* field, unsigned from_id, uint8_t* recv_buffer, DataCommMode data_mode) { + batch_set_shared_field(ctx, field, from_id, + recv_buffer, data_mode, 1); +} + +template +void batch_set_shared_field(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned from_id, uint8_t* recv_buffer, + DataCommMode data_mode, size_t elem_size) { assert(data_mode != noData); struct CUDA_Context_Shared* shared; if (sharedType == sharedMaster) { @@ -666,54 +714,57 @@ void batch_set_shared_field(struct CUDA_Context_Common* ctx, // timer.start(); // timer1.start(); deserializeMessage(ctx, data_mode, v_size, shared->num_nodes[from_id], - shared_data, recv_buffer); + 
shared_data, recv_buffer, elem_size); // timer1.stop(); // timer2.start(); if (data_mode == onlyData) { if (op == setOp) { batch_set_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == addOp) { batch_add_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == minOp) { batch_min_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } } else if (data_mode == gidsData) { if (op == setOp) { batch_set_subset<<>>( - v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); + v_size, elem_size, ctx->offsets.device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), + field->is_updated.gpu_wr_ptr()); } else if (op == addOp) { batch_add_subset<<>>( - v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); + v_size, elem_size, ctx->offsets.device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), + field->is_updated.gpu_wr_ptr()); } else if (op == minOp) { batch_min_subset<<>>( - v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); + v_size, elem_size, ctx->offsets.device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), + field->is_updated.gpu_wr_ptr()); } } else { // bitsetData || offsetsData if (op == setOp) { batch_set_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == addOp) { batch_add_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == minOp) { batch_min_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 320189c44e..e0d90216e2 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -14,6 +14,7 @@ set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) + target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) target_link_libraries(galois_gnn galois_shmem) target_link_libraries(galois_gnn ${INTEL_LIBS}) @@ -38,6 +39,8 @@ if (GALOIS_ENABLE_GPU) src/layers/SoftmaxLayer.cu src/GraphNeuralNetwork.cu src/GNNOptimizers.cu + src/GNNCudaContext.cu + src/CUDAUtil.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) @@ -49,7 +52,7 @@ if (GALOIS_ENABLE_GPU) ) # link to gpu lib (which takes care of moderngpu and cub) as well as cu libs - target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas 
-lcurand) + target_link_libraries(galois_gnn_gpu galois_gluon Galois::gpu galois_support -lcublas -lcurand) # gpu -> cpu lib target_link_libraries(galois_gnn galois_gnn_gpu) diff --git a/libgnn/include/galois/CUDAUtilHostDecls.h b/libgnn/include/galois/CUDAUtilHostDecls.h new file mode 100644 index 0000000000..d9fe5230a5 --- /dev/null +++ b/libgnn/include/galois/CUDAUtilHostDecls.h @@ -0,0 +1,3 @@ +#pragma once + +void SetCUDADeviceId(int gpu_id); diff --git a/libgnn/include/galois/GNNCudaContextHostDecls.h b/libgnn/include/galois/GNNCudaContextHostDecls.h new file mode 100644 index 0000000000..fea68d5fec --- /dev/null +++ b/libgnn/include/galois/GNNCudaContextHostDecls.h @@ -0,0 +1,82 @@ +#pragma once + +#include "galois/cuda/HostDecls.h" + +extern int gpudevice; + +void load_graph_CUDA_GNN(struct CUDA_Context* ctx, PartitionedGraphInfo& g, + unsigned num_hosts); +void resize_CUDA_layer_vector(struct CUDA_Context* ctx, size_t num_layers); +void init_CUDA_layer_vector_meta_obj(struct CUDA_Context* ctx, + unsigned layer_number, unsigned num_hosts, + unsigned nnodes, size_t infl_in_size, + size_t infl_out_size); + +namespace galois { +void batch_get_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number); +void batch_aggregate_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_aggregate_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_set_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number); +void batch_set_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number); +void batch_get_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number); +void batch_aggregate_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_aggregate_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_set_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number); +void batch_set_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* 
buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number); + +void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, + size_t column_size, size_t num_nodes, + unsigned layer_number); +size_t getLayerInputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number); +size_t getLayerOutputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number); +} // namespace galois diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 86a656fd30..8a171f96da 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -54,24 +54,28 @@ class AdamOptimizer : public BaseOptimizer { beta2_power_t_(num_trainable_layers_, config.beta2) { // >= because only prefix will be considered otherwise assert(trainable_layer_sizes.size() >= num_trainable_layers_); -#ifndef GALOIS_ENABLE_GPU - // allocate vectors based on # of trainable layers - for (size_t i = 0; i < num_trainable_layers_; i++) { - first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); - second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); - // Pointer with size construction - p_first_moments_.emplace_back(first_moments_.back()); - p_second_moments_.emplace_back(second_moments_.back()); - } - assert(first_moments_.size() == num_trainable_layers_); - assert(second_moments_.size() == num_trainable_layers_); -#else - // pointer with size initialization with GPU pointers - for (size_t i = 0; i < num_trainable_layers_; i++) { - p_first_moments_.emplace_back(gpu_object_.first_moment(i), - trainable_layer_sizes[i]); - p_second_moments_.emplace_back(gpu_object_.second_moment(i), - trainable_layer_sizes[i]); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // pointer with size initialization with GPU pointers + for (size_t i = 0; i < num_trainable_layers_; i++) { + p_first_moments_.emplace_back(gpu_object_.first_moment(i), + trainable_layer_sizes[i]); + p_second_moments_.emplace_back(gpu_object_.second_moment(i), + trainable_layer_sizes[i]); + } + } else { +#endif + // allocate vectors based on # of trainable layers + for (size_t i = 0; i < num_trainable_layers_; i++) { + first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + // Pointer with size construction + p_first_moments_.emplace_back(first_moments_.back()); + p_second_moments_.emplace_back(second_moments_.back()); + } + assert(first_moments_.size() == num_trainable_layers_); + assert(second_moments_.size() == num_trainable_layers_); +#ifdef GALOIS_ENABLE_GPU } #endif } diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 40f19da7b0..3603cb68d7 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -6,6 +6,11 @@ #include #include +#ifdef GALOIS_ENABLE_GPU +enum class DevicePersonality { CPU, GPU_CUDA }; +extern DevicePersonality device_personality; +#endif + namespace galois { //! Floating point type to use throughout GNN compute; typedef'd so it's easier //! 
to flip later diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2ed6647b7c..7b55b84162 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -119,13 +119,14 @@ class GNNGraph { //! Return matrix of the local node features const PointerWithSize GetLocalFeatures() { -#ifndef GALOIS_ENABLE_GPU - return PointerWithSize(local_node_features_); -#else - // TODO remove reliance on local_node_features - return PointerWithSize(gpu_memory_.feature_vector(), - local_node_features_.size()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // TODO remove reliance on local_node_features + return PointerWithSize(gpu_memory_.feature_vector(), + local_node_features_.size()); + } #endif + return PointerWithSize(local_node_features_); } //! Given an LID and the current phase of GNN computation, determine if the @@ -178,7 +179,23 @@ class GNNGraph { void CalculateSpecialNormFactor(bool is_sampled, bool is_inductive); #ifdef GALOIS_ENABLE_GPU + void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, + const unsigned layer_number) const; + + void InitLayerVectorMetaObjects(size_t layer_number, unsigned num_hosts, + size_t infl_in_size, size_t infl_out_size); + + void ResizeLayerVector(size_t num_layers); + const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } + + void GetMarshalGraph(MarshalGraph& m) const { + sync_substrate_->getMarshalGraph(m, false); + } + + void GetPartitionedGraphInfo(PartitionedGraphInfo& g_info) const { + sync_substrate_->getPartitionedGraphInfo(g_info); + } #endif private: @@ -277,6 +294,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU + struct CUDA_Context* cuda_ctx_; //! Object that holds all GPU allocated pointers to memory related to graphs. GNNGraphGPUAllocations gpu_memory_; //! Call this to setup GPU memory for this graph: allocates necessary GPU diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 75a18fd830..62a5ab14cb 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -2,18 +2,28 @@ // gets synchronized #include "galois/GNNTypes.h" #include "galois/BufferWrapper.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/GNNCudaContextHostDecls.h" +#endif namespace galois { namespace graphs { extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; +#ifdef GALOIS_ENABLE_GPU +extern struct CUDA_Context* cuda_ctx_for_sync; +extern unsigned layer_number_to_sync; +#endif struct GNNSumAggregate { using ValTy = galois::BufferWrapper; //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { + // It should be a CPU synchronizing substrate. + // If the GPU flag is turned off, then personality does not exist. 
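  // extract() hands Gluon a zero-copy BufferWrapper view of one row of the
  // matrix registered for synchronization: node_id's row starts at
  // gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_] and holds
  // gnn_matrix_to_sync_column_length_ floats (e.g. with 16 columns, node 3's
  // row is elements 48..63).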
+ // assert(device_personality == DevicePersonality::CPU); ValTy extracted_vec( &gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_], gnn_matrix_to_sync_column_length_); @@ -51,16 +61,24 @@ struct GNNSumAggregate { return false; } static bool extract_batch(unsigned, uint8_t*) { return false; } - static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { - return false; - } - static bool extract_reset_batch(unsigned, uint8_t*) { return false; } static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +#ifdef GALOIS_ENABLE_GPU +GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, + gnn_matrix_to_sync_column_length_, + layer_number_to_sync); +GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_output, cuda_ctx_for_sync, + gnn_matrix_to_sync_column_length_, + layer_number_to_sync); +#endif } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 6ec6a78671..c4cc29290f 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -88,7 +88,9 @@ class GNNLayer { layer_weights_.assign(layer_weights_.size(), 1); } #ifdef GALOIS_ENABLE_GPU - CopyLayerWeightsToGPU(); + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } #endif } diff --git a/libgnn/src/CUDAUtil.cu b/libgnn/src/CUDAUtil.cu new file mode 100644 index 0000000000..7d39a81ff2 --- /dev/null +++ b/libgnn/src/CUDAUtil.cu @@ -0,0 +1,9 @@ +#include + +#include "galois/CUDAUtilHostDecls.h" +#include "galois/GNNTypes.h" + +DevicePersonality device_personality; +int gpudevice; + +void SetCUDADeviceId(int gpu_id) { cudaSetDevice(gpu_id); } diff --git a/libgnn/src/GNNCudaContext.cu b/libgnn/src/GNNCudaContext.cu new file mode 100644 index 0000000000..d0512f8e72 --- /dev/null +++ b/libgnn/src/GNNCudaContext.cu @@ -0,0 +1,228 @@ +#include +#include "gg.h" +#include "ggcuda.h" +#include "galois/cuda/Context.h" +#include "galois/GNNTypes.h" +#include "galois/runtime/cuda/DeviceSync.h" +#include "galois/GNNCudaContextHostDecls.h" + +// The forward declaration is in the original Context.h file; as long as +// pointers to it are used it shouldn't be an issue (since space usage is +// unknown at that point) +struct CUDA_Context : public CUDA_Context_Common { + // TODO to arrays: each context handles all layers of the graph + // Possible to add a "layer" argument to the below functions? + std::vector> layer_input_matrix; + std::vector> layer_output_matrix; + std::vector layer_input_matrix_column_size; + std::vector layer_output_matrix_column_size; +}; + +//! Allocates a new CUDA context +//! 
Note: caller is responsible for freeing it +struct CUDA_Context* get_CUDA_context(int id) { + struct CUDA_Context* ctx = + (struct CUDA_Context*)calloc(1, sizeof(struct CUDA_Context)); + ctx->id = id; + return ctx; +} + +bool init_CUDA_context(struct CUDA_Context* ctx, int device) { + return init_CUDA_context_common(ctx, device); +} + +void resize_CUDA_layer_vector(struct CUDA_Context* ctx, size_t num_layers) { + ctx->layer_output_matrix.resize(num_layers); + ctx->layer_output_matrix_column_size.resize(num_layers); + ctx->layer_input_matrix.resize(num_layers); + ctx->layer_input_matrix_column_size.resize(num_layers); +} + +void load_graph_CUDA_GNN(struct CUDA_Context* ctx, PartitionedGraphInfo& g_info, + unsigned num_hosts) { + size_t mem_usage = mem_usage_CUDA_common(g_info, num_hosts); + printf("[%d] Host memory for communication context: (%3u B) %3u MB\n", + ctx->id, mem_usage, mem_usage / 1048756); + + // TODO This is expensive; is it required? Can we get away with less? + // should only need one copy of mirror/masters for entire execution, + // not per layer + // graph does not need to be copied either since that's handled elsewhere + // (gpu object on GNNGraph) + load_graph_CUDA_common(ctx, g_info, num_hosts); +} + +void init_CUDA_layer_vector_meta_obj(struct CUDA_Context* ctx, + unsigned layer_number, unsigned num_hosts, + unsigned nnodes, size_t infl_in_size, + size_t infl_out_size) { + ctx->layer_input_matrix_column_size[layer_number] = infl_in_size; + load_graph_CUDA_field_inflating(ctx, &ctx->layer_input_matrix[layer_number], + num_hosts, nnodes, infl_in_size, false); + ctx->layer_output_matrix_column_size[layer_number] = infl_out_size; + load_graph_CUDA_field_inflating(ctx, &ctx->layer_output_matrix[layer_number], + num_hosts, nnodes, infl_out_size, false); +} + +////////// layer_input_matrix (forward) synchronization function /////////////// + +namespace galois { +void batch_get_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, buf_size, mode, + column_size); +} + +void batch_get_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, column_size); +} + +void batch_aggregate_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_aggregate_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + 
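  // Same call shape as batch_set_node_layer_input_matrix_cuda above; the
  // difference lives in the (elided) template arguments of
  // batch_set_shared_field, which choose whether master or mirror copies of
  // this layer's input matrix receive the column_size values per node.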
batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_get_reset_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, buf_size, mode, + column_size); +} + +void batch_get_reset_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, column_size); +} + +////////// layer_output_matrix (backward) synchronization function ///////////// + +void batch_get_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, buf_size, + mode, column_size); +} + +void batch_get_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, column_size); +} + +void batch_aggregate_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_aggregate_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_get_reset_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, buf_size, + mode, column_size); +} + +void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, column_size); +} + +void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, + size_t column_size, size_t num_nodes, + unsigned layer_number) { + if (ctx->layer_input_matrix_column_size[layer_number] == column_size) { + ctx->layer_input_matrix[layer_number].data.set_data( + layer_matrix, column_size * num_nodes); + } else if (ctx->layer_output_matrix_column_size[layer_number] == + column_size) { + 
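    // column_size matched this layer's output width instead, so point the
    // output-matrix view at the caller's buffer (num_nodes rows of column_size
    // values). Note the dispatch assumes a layer's input and output widths
    // differ; when they are equal the input branch above always wins.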
ctx->layer_output_matrix[layer_number].data.set_data( + layer_matrix, column_size * num_nodes); + } +} + +size_t getLayerInputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number) { + return ctx->layer_input_matrix_column_size[layer_number]; +} + +size_t getLayerOutputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number) { + return ctx->layer_output_matrix_column_size[layer_number]; +} +} // namespace galois diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 566b61c14e..664de35e01 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -14,33 +14,38 @@ void galois::AdamOptimizer::GradientDescent( assert(derivatives.size() == first_moment.size()); assert(derivatives.size() == second_moment.size()); -#ifndef GALOIS_ENABLE_GPU - // individual weight updates via gradients - galois::do_all( - galois::iterate(static_cast(0), matrix.size()), - [&](size_t i) { - // moment estimate updates - first_moment[i] = config_.beta1 * first_moment[i] + - (1.0 - config_.beta1) * derivatives[i]; - second_moment[i] = - config_.beta2 * second_moment[i] + - (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); - // bias corrected moments using beta power - GNNFloat bias_correct_first = - first_moment[i] / (1.0 - beta1_power_t_[layer_number]); - GNNFloat bias_correct_second = - second_moment[i] / (1.0 - beta2_power_t_[layer_number]); - // weight update using bias corrected moments - (matrix.data())[i] -= config_.alpha * bias_correct_first / - std::sqrt(bias_correct_second + config_.epsilon); - }, - galois::loopname("AdamOptimizerGradientDescent")); -#else - gpu_object_.AdamUpdate(derivatives.data(), matrix.data(), matrix.size(), - first_moment.data(), second_moment.data(), - config_.alpha, config_.beta1, config_.beta2, - config_.epsilon, beta1_power_t_[layer_number], - beta2_power_t_[layer_number]); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AdamUpdate(derivatives.data(), matrix.data(), matrix.size(), + first_moment.data(), second_moment.data(), + config_.alpha, config_.beta1, config_.beta2, + config_.epsilon, beta1_power_t_[layer_number], + beta2_power_t_[layer_number]); + } else { +#endif + // individual weight updates via gradients + galois::do_all( + galois::iterate(static_cast(0), matrix.size()), + [&](size_t i) { + // moment estimate updates + first_moment[i] = config_.beta1 * first_moment[i] + + (1.0 - config_.beta1) * derivatives[i]; + second_moment[i] = + config_.beta2 * second_moment[i] + + (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); + // bias corrected moments using beta power + GNNFloat bias_correct_first = + first_moment[i] / (1.0 - beta1_power_t_[layer_number]); + GNNFloat bias_correct_second = + second_moment[i] / (1.0 - beta2_power_t_[layer_number]); + // weight update using bias corrected moments + (matrix.data())[i] -= + config_.alpha * bias_correct_first / + std::sqrt(bias_correct_second + config_.epsilon); + }, + galois::loopname("AdamOptimizerGradientDescent")); +#ifdef GALOIS_ENABLE_GPU + } #endif // update the power terms for next update call diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index be188ff843..5eac909e18 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -14,6 +14,11 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( // this will be the # of rows for each layer size_t max_rows = graph_->size(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == 
DevicePersonality::GPU_CUDA) { + graph_->ResizeLayerVector(config_.num_intermediate_layers()); + } +#endif // create the intermediate layers for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { GNNLayerType layer_type = config_.intermediate_layer_type(i); @@ -36,6 +41,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->InitLayerVectorMetaObjects( + i, galois::runtime::getSystemNetworkInterface().Num, + layer_dims.input_columns, layer_dims.output_columns); + } +#endif if (i == config_.num_intermediate_layers() - 1) { // last layer before output layer should never have activation gnn_layers_.back()->DisableActivation(); @@ -86,11 +98,16 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); - // if (config_.do_sampling()) { - // for (std::unique_ptr& ptr : gnn_layers_) { - // assert(ptr->IsSampledLayer()); - // } - // } + std::vector cpu_pred; + float train_accuracy{0.f}; + + /* + if (config_.do_sampling()) { + for (std::unique_ptr& ptr : gnn_layers_) { + assert(ptr->IsSampledLayer()); + } + } + */ if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); @@ -105,7 +122,22 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } const PointerWithSize predictions = DoInference(); GradientPropagation(); - float train_accuracy = GetGlobalAccuracy(predictions); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (cpu_pred.size() != predictions.size()) { + cpu_pred.resize(predictions.size()); + } + + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred, predictions); + train_accuracy = GetGlobalAccuracy(cpu_pred); + } else { +#endif + train_accuracy = GetGlobalAccuracy(predictions); +#ifdef GALOIS_ENABLE_GPU + } +#endif + if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); @@ -118,7 +150,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { acc_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize predictions = DoInference(); - float global_accuracy = GetGlobalAccuracy(predictions); + float global_accuracy{0.0}; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred, predictions); + global_accuracy = GetGlobalAccuracy(cpu_pred); + } else { +#endif + global_accuracy = GetGlobalAccuracy(predictions); +#ifdef GALOIS_ENABLE_GPU + } +#endif acc_timer.stop(); if (this_host == 0) { @@ -136,6 +179,7 @@ galois::GraphNeuralNetwork::DoInference() { for (std::unique_ptr& ptr : gnn_layers_) { layer_input = ptr->ForwardPhase(layer_input); } + return layer_input; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 3e5d468da2..c102fc8283 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -35,6 +35,10 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +#ifdef GALOIS_ENABLE_GPU +struct CUDA_Context* cuda_ctx_for_sync; +unsigned layer_number_to_sync; +#endif } // namespace graphs } // namespace galois @@ -78,9 
+82,21 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, InitNormFactor(); #ifdef GALOIS_ENABLE_GPU - // allocate/copy data structures over to GPU - GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); - InitGPUMemory(); + if (device_personality == DevicePersonality::GPU_CUDA) { + // allocate/copy data structures over to GPU + GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); + InitGPUMemory(); + + // initialize CUDA context + cuda_ctx_ = get_CUDA_context(host_id_); + if (!init_CUDA_context(cuda_ctx_, ::gpudevice)) { + GALOIS_DIE("Failed to initialize CUDA context"); + } + PartitionedGraphInfo g_info; + GetPartitionedGraphInfo(g_info); + load_graph_CUDA_GNN(cuda_ctx_, g_info, + galois::runtime::getSystemNetworkInterface().Num); + } #endif } @@ -124,13 +140,44 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_column_length_ = matrix_column_size; // XXX bitset setting - // call sync sync_substrate_->sync( "GraphAggregateSync"); } -void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.5); } +#ifdef GALOIS_ENABLE_GPU +void galois::graphs::GNNGraph::AggregateSync( + GNNFloat* matrix_to_sync, const size_t matrix_column_size, + const unsigned layer_number) const { + size_t layer_input_mtx_column_size = + getLayerInputMatrixColumnSize(cuda_ctx_, layer_number); + size_t layer_output_mtx_column_size = + getLayerOutputMatrixColumnSize(cuda_ctx_, layer_number); + // set globals for the sync substrate + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + cuda_ctx_for_sync = cuda_ctx_; + layer_number_to_sync = layer_number; + // XXX bitset setting + // call sync + cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), + layer_number); + + if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else if (gnn_matrix_to_sync_column_length_ == + layer_output_mtx_column_size) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" + " match to the column size of the CUDA context"); + } +} +#endif + +void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( @@ -685,4 +732,15 @@ void galois::graphs::GNNGraph::InitGPUMemory() { local_testing_mask_); gpu_memory_.SetNormFactors(norm_factors_); } + +void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( + size_t layer_number, unsigned num_hosts, size_t infl_in_size, + size_t infl_out_size) { + init_CUDA_layer_vector_meta_obj(cuda_ctx_, layer_number, num_hosts, size(), + infl_in_size, infl_out_size); +} + +void galois::graphs::GNNGraph::ResizeLayerVector(size_t num_layers) { + resize_CUDA_layer_vector(cuda_ctx_, num_layers); +} #endif diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index aff4bc3b11..9da77a004f 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -19,9 +19,11 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); #ifdef GALOIS_ENABLE_GPU - base_gpu_object_.InitWeightMemory(num_weight_elements); - base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * - layer_dimensions_.input_columns); + if (device_personality == 
DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + } #endif GlorotBengioInit(&layer_weights_); @@ -42,31 +44,35 @@ galois::GNNLayer::GNNLayer(size_t layer_num, backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); #ifdef GALOIS_ENABLE_GPU - base_gpu_object_.InitInOutMemory(num_output_elements, - layer_dimensions_.input_rows * - layer_dimensions_.input_columns); -#endif + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); - // initialize the PointerWithSize wrappers -#ifndef GALOIS_ENABLE_GPU - p_layer_weights_ = PointerWithSize(layer_weights_); - p_layer_weight_gradients_ = - PointerWithSize(layer_weight_gradients_); - p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); - p_backward_output_matrix_ = - PointerWithSize(backward_output_matrix_); -#else - p_layer_weights_ = PointerWithSize(base_gpu_object_.layer_weights(), - layer_weights_.size()); - p_layer_weight_gradients_ = - PointerWithSize(base_gpu_object_.layer_weight_gradients(), - layer_weight_gradients_.size()); - p_forward_output_matrix_ = PointerWithSize( - base_gpu_object_.forward_output(), forward_output_matrix_.size()); - p_backward_output_matrix_ = PointerWithSize( - base_gpu_object_.backward_output(), backward_output_matrix_.size()); - // TODO can clear the cpu side vectors/don't use .size() since optimally they - // aren't initialized + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize( + base_gpu_object_.layer_weights(), layer_weights_.size()); + p_layer_weight_gradients_ = + PointerWithSize(base_gpu_object_.layer_weight_gradients(), + layer_weight_gradients_.size()); + p_forward_output_matrix_ = PointerWithSize( + base_gpu_object_.forward_output(), forward_output_matrix_.size()); + p_backward_output_matrix_ = PointerWithSize( + base_gpu_object_.backward_output(), backward_output_matrix_.size()); + // TODO can clear the cpu side vectors/don't use .size() since optimally + // they aren't initialized + } else { +#endif + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize(layer_weights_); + p_layer_weight_gradients_ = + PointerWithSize(layer_weight_gradients_); + p_forward_output_matrix_ = + PointerWithSize(forward_output_matrix_); + p_backward_output_matrix_ = + PointerWithSize(backward_output_matrix_); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -81,7 +87,9 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { (*vector_to_init)[i] = dist(rng); } #ifdef GALOIS_ENABLE_GPU - CopyLayerWeightsToGPU(); + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } #endif } @@ -94,7 +102,9 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { }, galois::loopname("RandomInitVector")); #ifdef GALOIS_ENABLE_GPU - CopyLayerWeightsToGPU(); + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } #endif } @@ -128,11 +138,15 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { -#ifndef GALOIS_ENABLE_GPU - DoDropoutCPU(input_to_dropout, output_matrix); -#else - base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, - config_.dropout_rate); +#ifdef 
GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, + config_.dropout_rate); + } else { +#endif + DoDropoutCPU(input_to_dropout, output_matrix); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -140,19 +154,23 @@ void galois::GNNLayer::DoDropoutDerivative() { assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); -#ifndef GALOIS_ENABLE_GPU - // use dropout mask to figure out derivative - galois::do_all( - galois::iterate(static_cast(0), backward_output_matrix_.size()), - [&](size_t i) { - backward_output_matrix_[i] = backward_output_matrix_[i] * - static_cast(dropout_mask_[i]) * - scale; - }, - galois::loopname("LayerDropoutDerivative")); -#else - base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), - scale); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), + scale); + } else { +#endif + // use dropout mask to figure out derivative + galois::do_all( + galois::iterate(static_cast(0), backward_output_matrix_.size()), + [&](size_t i) { + backward_output_matrix_[i] = backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * + scale; + }, + galois::loopname("LayerDropoutDerivative")); +#ifdef GALOIS_ENABLE_GPU + } #endif } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 208229d6f1..07f69cee6e 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -20,22 +20,24 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU - gpu_object_.Allocate(num_input_elements, num_output_elements); + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.Allocate(num_input_elements, num_output_elements); + // init pointers with size + p_in_temp_1_ = + PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + p_in_temp_2_ = + PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + p_out_temp_ = + PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + } else { #endif - - // init pointers with size -#ifndef GALOIS_ENABLE_GPU - p_in_temp_1_ = PointerWithSize(in_temp_1_); - p_in_temp_2_ = PointerWithSize(in_temp_2_); - p_out_temp_ = PointerWithSize(out_temp_); -#else - p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); - p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); - p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + } #endif + GALOIS_LOG_VERBOSE("Conv layer initialized"); } @@ -121,18 +123,22 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there -#ifndef GALOIS_ENABLE_GPU - // temp 2 holds aggregated feature vectors from forward phase - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - p_in_temp_2_.data(), input_gradient->data(), - p_layer_weight_gradients_.data()); -#else - gpu_object_.GetWeightGradientsGPU( - 
layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, p_in_temp_2_.data(), - input_gradient->data(), p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, p_in_temp_2_.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); + } else { +#endif + // temp 2 holds aggregated feature vectors from forward phase + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + p_in_temp_2_.data(), input_gradient->data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } #endif } else { // TODO at this point, out_temp contains memoized FW @@ -150,18 +156,21 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // TODO put this in a function // W' = F^T (FW)' -#ifndef GALOIS_ENABLE_GPU - // weight gradient; note the use of the aggregated gradient in out_temp - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), p_out_temp_.data(), - p_layer_weight_gradients_.data()); -#else - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); + } else { +#endif + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), p_out_temp_.data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -183,12 +192,17 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, [[maybe_unused]] galois::substrate::PerThreadStorage>* pts) { -#ifndef GALOIS_ENABLE_GPU - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); -#else - gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), - column_length, node_embeddings, aggregate_output, - config_.do_normalization); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), + column_length, node_embeddings, + aggregate_output, config_.do_normalization); + graph_.AggregateSync(aggregate_output, column_length, layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -284,24 +298,27 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); - // aggregate sync graph_.AggregateSync(aggregate_output, column_length); } void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { -#ifndef GALOIS_ENABLE_GPU - // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, 
layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_.data(), output); -#else - gpu_object_.UpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - base_gpu_object_.layer_weights(), output); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -309,18 +326,21 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifndef GALOIS_ENABLE_GPU - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); -#else - gpu_object_.UpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, gradients, - base_gpu_object_.layer_weights(), output); - +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } #endif } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index d98251091c..f541b43a18 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -48,14 +48,15 @@ galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { -#ifndef GALOIS_ENABLE_GPU - return ForwardPhaseCPU(input_embeddings); -#else - gpu_object_.ForwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - input_embeddings.data(), p_forward_output_matrix_.data()); - return p_forward_output_matrix_; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.ForwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + input_embeddings.data(), p_forward_output_matrix_.data()); + return p_forward_output_matrix_; + } #endif + return ForwardPhaseCPU(input_embeddings); } galois::PointerWithSize @@ -112,14 +113,15 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::PointerWithSize galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, PointerWithSize*) { -#ifndef 
GALOIS_ENABLE_GPU - return BackwardPhaseCPU(); -#else - gpu_object_.BackwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); - return p_backward_output_matrix_; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.BackwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); + return p_backward_output_matrix_; + } #endif + return BackwardPhaseCPU(); } // TODO function for getting loss diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index a6b711397b..9e10da1246 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -74,6 +74,9 @@ else() add_executable(gpu-epoch-test gpu-epoch-test.cpp) target_link_libraries(gpu-epoch-test galois_gnn) #add_test(NAME gpu-epoch-test COMMAND gpu-epoch-test) + + add_executable(gpu-aggregate-sync-test gpu-aggregate-sync-test.cpp) + target_link_libraries(gpu-aggregate-sync-test galois_gnn) endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index a1d0c1961e..ed99982a78 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -13,7 +13,7 @@ int main() { GALOIS_LOG_VERBOSE("[{}] Using {} threads", galois::runtime::getSystemNetworkInterface().ID, num_threads); - + device_personality = DevicePersonality::GPU_CUDA; // create sample config that is easy to trace galois::AdamOptimizer::AdamConfiguration config; config.alpha = 1; diff --git a/libgnn/test/gpu-aggregate-sync-test.cpp b/libgnn/test/gpu-aggregate-sync-test.cpp new file mode 100644 index 0000000000..a3f645c5ee --- /dev/null +++ b/libgnn/test/gpu-aggregate-sync-test.cpp @@ -0,0 +1,212 @@ +//! @file gpu-aggregate-sync-test.cpp +//! 
GPU sync test to make sure it's sane +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/CUDAUtilHostDecls.h" + +int main() { + galois::DistMemSys G; + + if (galois::runtime::getSystemNetworkInterface().Num == 1) { + GALOIS_LOG_ERROR("This test should be run with multiple hosts/processes"); + exit(1); + } + device_personality = DevicePersonality::GPU_CUDA; + gpudevice = galois::runtime::getSystemNetworkInterface().ID; + SetCUDADeviceId(gpudevice); + + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + galois::GNNLayerConfig l_config; + l_config.disable_aggregate_after_update = true; + + unsigned num_layers = 2; + test_graph->ResizeLayerVector(num_layers); + test_graph->InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph->InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + // create the layer, no norm factor + std::unique_ptr layer_0 = + std::make_unique(0, *(test_graph.get()), + dimension_0, l_config); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + layer_0->ForwardPhase(test_graph->GetLocalFeatures()); + // pointer is to GPU memory: copy it over to a CPU source for verification + const std::vector& layer_0_forward_output = + layer_0->CopyForwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check output + ////////////////////////////////////////////////////////////////////////////// + + // check each row on each host: convert row into GID, and based on GID we + // know what the ground truth is + // row 0 = 3 + // row 1 = 6 + // row 2 = 12 + // row 3 = 18 + // row 4 = 24 + // row 5 = 30 + // row 6 = 15 + + // row should correspond to LID + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_0_forward_output[row * 2 + c] == ground_truth); + } + } + + ////////////////////////////////////////////////////////////////////////////// + + std::vector dummy_ones_v(test_graph->size() * 2, 1); + galois::PointerWithSize dummy_ones = + layer_0->AllocateGPU(dummy_ones_v); + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + const std::vector& layer_0_backward_output = + layer_0->CopyBackwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 
0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == test_graph->size() * 3); + for (size_t i = 0; i < layer_0_backward_output.size(); i++) { + GALOIS_LOG_ASSERT((layer_0_backward_output)[i] == 0); + } + + ////////////////////////////////////////////////////////////////////////////// + // layer 1 to check backward output + ////////////////////////////////////////////////////////////////////////////// + std::unique_ptr layer_1 = + std::make_unique(1, *(test_graph.get()), + dimension_0, l_config); + layer_1->InitAllWeightsTo1(); + layer_1->ForwardPhase(test_graph->GetLocalFeatures()); + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + // same check for forward as before + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_1_forward_output[row * 2 + c] == ground_truth); + } + } + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(test_graph->size() * 2, 1); + layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + const std::vector& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); + + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + case 6: + ground_truth = 2; + break; + case 1: + case 2: + case 3: + case 4: + case 5: + ground_truth = 4; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 3 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth); + } + } + + // TODO CVC +} diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index a79262d706..947a0b8703 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -13,6 +13,7 @@ int main() { GALOIS_LOG_VERBOSE("[{}] Using {} threads", galois::runtime::getSystemNetworkInterface().ID, num_threads); + device_personality = DevicePersonality::GPU_CUDA; // load test graph galois::graphs::GNNGraph test_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); @@ -31,8 +32,16 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; + unsigned num_layers = 2; + test_graph.ResizeLayerVector(num_layers); + test_graph.InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); // create the layer, no norm factor 
std::unique_ptr layer_0 = std::make_unique(0, test_graph, diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 3a481b9d66..3ac2c2b2ed 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -11,6 +11,7 @@ int main() { 56 / galois::runtime::getSystemNetworkInterface().Num); // size_t num_threads = galois::setActiveThreads(1); GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + device_personality = DevicePersonality::GPU_CUDA; // load graph auto test_graph = std::make_unique( diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 453606e311..5d52e80e35 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -11,6 +11,7 @@ int main() { size_t num_threads = galois::setActiveThreads( 56 / galois::runtime::getSystemNetworkInterface().Num); GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + device_personality = DevicePersonality::GPU_CUDA; // load test graph galois::graphs::GNNGraph test_graph( diff --git a/libgpu/include/sharedptr.h b/libgpu/include/sharedptr.h index 9ce66de597..191812ff57 100644 --- a/libgpu/include/sharedptr.h +++ b/libgpu/include/sharedptr.h @@ -202,6 +202,17 @@ class Shared { return ptrs[0]; #endif } + + void set_data(T* src, size_t src_nmemb) { set_data(src, src_nmemb, 1); } + + void set_data(T* src, size_t src_nmemb, int device) { + if (this->nmemb == 0) { + alloc(src_nmemb); + nmemb = src_nmemb; + } + assert(this->nmemb == src_nmemb); + ptrs[device] = src; + } }; template diff --git a/lonestar/libgnnbench/CMakeLists.txt b/lonestar/libgnnbench/CMakeLists.txt index 14d152c8e7..0818a3310c 100644 --- a/lonestar/libgnnbench/CMakeLists.txt +++ b/lonestar/libgnnbench/CMakeLists.txt @@ -2,4 +2,5 @@ add_library(gnnbench STATIC src/Input.cpp src/Start.cpp) target_include_directories(gnnbench PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" ) + target_link_libraries(gnnbench galois_gnn LLVMSupport) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 784b1fd431..dc62b19d50 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -4,6 +4,10 @@ #include "galois/graphs/GNNGraph.h" #include +#ifdef GALOIS_ENABLE_GPU +extern int gpudevice; +#endif + //! Directory where all files used for GNN training are found extern llvm::cl::opt input_directory; //! Base graph name (used to find the csgr, features, masks, etc.) 
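
Note on the pattern in the hunks above: the old code selected the CPU or GPU implementation at compile time with #ifndef GALOIS_ENABLE_GPU / #else, so a GPU build dropped the CPU path entirely. After this change a GALOIS_ENABLE_GPU build keeps both paths and only takes the CUDA branch when the host's device_personality is GPU_CUDA, which is why the GPU tests above now set device_personality by hand. A standalone mock of that dispatch shape, assuming a GPU-enabled build; the stub functions and the hard-coded define are illustrative, not the library's API:

#include <cstdio>

// Mock of the runtime CPU/GPU dispatch used throughout this patch.
enum class DevicePersonality { CPU, GPU_CUDA };
static DevicePersonality device_personality = DevicePersonality::CPU;

#define GALOIS_ENABLE_GPU 1 // assumption: built with GPU support

static void ForwardCPU() { std::puts("CPU path"); } // stand-in for do_all/CBLAS code
static void ForwardGPU() { std::puts("GPU path"); } // stand-in for a CUDA kernel call

void Forward() {
#ifdef GALOIS_ENABLE_GPU
  if (device_personality == DevicePersonality::GPU_CUDA) {
    ForwardGPU();
  } else {
#endif
    ForwardCPU();
#ifdef GALOIS_ENABLE_GPU
  }
#endif
}

int main() { Forward(); } // prints "CPU path" unless the personality is switched
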
diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index c03970c868..75ec167f78 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -3,13 +3,27 @@ #include "galois/Galois.h" #include "galois/Version.h" #include "GNNBench/Input.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/CUDAUtilHostDecls.h" +#endif //////////////////////////////////////////////////////////////////////////////// // CLI //////////////////////////////////////////////////////////////////////////////// +extern llvm::cl::opt num_threads; extern llvm::cl::opt num_epochs; +#ifdef GALOIS_ENABLE_GPU +std::string personality_str(DevicePersonality p); +extern llvm::cl::opt num_nodes; +extern llvm::cl::opt personality_set; + +namespace internal { +void heteroSetup(); +}; +#endif + //////////////////////////////////////////////////////////////////////////////// // Init functions //////////////////////////////////////////////////////////////////////////////// diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index 1a178c583d..aa059c60f6 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -4,12 +4,36 @@ namespace cll = llvm::cl; cll::opt num_threads("t", cll::desc("Number of threads (default 1)"), cll::init(1)); -cll::opt num_runs("runs", cll::desc("Number of runs (default 1)"), - cll::init(1)); cll::opt num_epochs("epochs", cll::desc("Number of epochs (default 50)"), cll::init(50)); +#ifdef GALOIS_ENABLE_GPU +std::string personality_str(DevicePersonality p) { + switch (p) { + case DevicePersonality::CPU: + return "CPU"; + case DevicePersonality::GPU_CUDA: + return "GPU_CUDA"; + default: + GALOIS_LOG_ASSERT(false && "Invalid personality"); + break; + } + return ""; +} + +cll::opt num_nodes( + "numNodes", + cll::desc("Num of physical nodes with devices (default = num of hosts): " + "detect GPU to use for each host automatically"), + cll::init(-1)); +cll::opt personality_set( + "pset", + cll::desc("String specifying personality for hosts on each physical " + "node. 
'c'=CPU, 'g'=GPU (default 'c')"), + cll::init("c")); +#endif + cll::opt stat_file("statFile", cll::desc("Optional output file to print stats to")); @@ -65,7 +89,6 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("GNNBench", "CommandLine", cmdout.str()); galois::runtime::reportParam("GNNBench", "Threads", num_threads); galois::runtime::reportParam("GNNBench", "Hosts", net.Num); - galois::runtime::reportParam("GNNBench", "Runs", num_runs); galois::runtime::reportParam("GNNBench", "Run_UUID", galois::runtime::getRandUUID()); galois::runtime::reportParam("GNNBench", "InputDirectory", input_directory); @@ -78,4 +101,48 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, char name[256]; gethostname(name, 256); galois::runtime::reportParam("GNNBench", "Hostname", name); + +#ifdef GALOIS_ENABLE_GPU + internal::heteroSetup(); +#endif +} + +#ifdef GALOIS_ENABLE_GPU +void internal::heteroSetup() { + const unsigned my_host_id = galois::runtime::getHostID(); + + auto& net = galois::runtime::getSystemNetworkInterface(); + + if (num_nodes == -1) { + num_nodes = net.Num; + } + + GALOIS_LOG_ASSERT((net.Num % num_nodes) == 0); + + device_personality = DevicePersonality::CPU; + if (personality_set.length() == (net.Num / num_nodes)) { + switch (personality_set.c_str()[my_host_id % (net.Num / num_nodes)]) { + case 'g': + galois::gInfo(my_host_id, " chooses GPU"); + device_personality = DevicePersonality::GPU_CUDA; + break; + case 'c': + galois::gInfo(my_host_id, " chooses CPU"); + device_personality = DevicePersonality::CPU; + break; + } + + if (device_personality == DevicePersonality::GPU_CUDA) { + gpudevice = get_gpu_device_id(personality_set, num_nodes); + } else { + gpudevice = -1; + } + + SetCUDADeviceId(gpudevice); + } else { + galois::gWarn( + "Command line option -pset ignored because its string length is not " + "equal to the number of processes/hosts on each physical node"); + } } +#endif From 0f99d7cbf9d78abc70d10be9767275610c7af801 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 14:34:41 -0600 Subject: [PATCH 466/660] libgnn: SAINT sampling, dense layer, var renames 1) Adds GraphSAINT random walk sampling; not exactly the same, but the idea is. 2) Adds a dense layer: GCN layer without any aggregation (just a weight multiply) 3) Renames some variables so that false is the default state. 
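
For item 1, the GraphSAINTSample added in GNNGraph.cpp below picks training nodes as walk roots (with replacement), walks a fixed number of steps, and marks every training node it visits as part of the subgraph. A rough standalone sketch of that idea over a toy adjacency list; the function name and the plain mt19937 generator are illustrative, while the real code walks the partitioned CSR with per-thread rand48 state:

#include <cstddef>
#include <random>
#include <vector>

// Marks a subgraph by short random walks rooted at training nodes.
std::vector<bool>
SampleByRandomWalk(const std::vector<std::vector<size_t>>& adj,
                   const std::vector<size_t>& training_nodes,
                   const std::vector<bool>& is_training, size_t num_roots,
                   size_t walk_depth, std::mt19937& rng) {
  std::vector<bool> in_subgraph(adj.size(), false);
  std::uniform_int_distribution<size_t> pick_root(0, training_nodes.size() - 1);
  for (size_t r = 0; r < num_roots; ++r) {
    // roots are training nodes chosen uniformly, with replacement
    size_t current = training_nodes[pick_root(rng)];
    in_subgraph[current] = true;
    for (size_t d = 0; d < walk_depth; ++d) {
      const std::vector<size_t>& nbrs = adj[current];
      if (nbrs.empty()) {
        break; // dead end: this walk stops early
      }
      std::uniform_int_distribution<size_t> pick_edge(0, nbrs.size() - 1);
      size_t candidate = nbrs[pick_edge(rng)];
      // only a training neighbour is added and walked from; otherwise the
      // step is skipped, so a walk may cover fewer than walk_depth nodes
      if (is_training[candidate]) {
        in_subgraph[candidate] = true;
        current = candidate;
      }
    }
  }
  return in_subgraph;
}
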
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/graphs/GNNGraph.h | 6 +- libgnn/include/galois/layers/DenseLayer.h | 54 ++++++++ libgnn/include/galois/layers/GNNLayer.h | 21 +-- libgnn/src/GraphNeuralNetwork.cpp | 20 ++- libgnn/src/graphs/GNNGraph.cpp | 75 ++++++++++- libgnn/src/layers/DenseLayer.cpp | 127 ++++++++++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 39 +++--- 8 files changed, 306 insertions(+), 37 deletions(-) create mode 100644 libgnn/include/galois/layers/DenseLayer.h create mode 100644 libgnn/src/layers/DenseLayer.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index e0d90216e2..83fcc327cf 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -6,6 +6,7 @@ set(sources src/layers/GNNLayer.cpp src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp + src/layers/DenseLayer.cpp src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp ) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 7b55b84162..3978661a54 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -153,9 +153,13 @@ class GNNGraph { //! (the meaning of on and off depends on how it is used; for now, it is used //! to indicate subgraph presence); droprate controls chance of being dropped //! (e.g. if 0.8, a node is 80% likely to not be included in subgraph) - void UniformNodeSample(); + void UniformNodeSample() { UniformNodeSample(0.5); } void UniformNodeSample(float droprate); + //! Use the sampling method present in GraphSAINT + void GraphSAINTSample() { GraphSAINTSample(3000, 2); }; + void GraphSAINTSample(size_t num_roots, size_t walk_depth); + //! Makes a node "sampled"; used for debugging/testing void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } //! Makes a node "not sampled"; used for debugging/testing diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h new file mode 100644 index 0000000000..d9918f8c2e --- /dev/null +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -0,0 +1,54 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +namespace galois { + +//! Just does a linear xform with no convolution over graph +class DenseLayer : public GNNLayer { +public: + //! Initializes the variables of the base class and also allocates additional + //! memory for temporary matrices. Also initializes sync substrate for the + //! weight matrix + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config); + + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : DenseLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} + + // Parent functions + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; + +private: + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + // Pointer with size versions + PointerWithSize p_in_temp_1_; + + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. 
+ galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + + //! Do embedding update via mxm with this layer's weights (forward) + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! Calculate graident via mxm with last layer's gradients (backward) + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan/loc) replace with dense gpu object + GCNGPUAllocations gpu_object_; +#endif +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index c4cc29290f..e387441b8f 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -16,8 +16,10 @@ enum class GNNLayerType { //! Invalid placeholder kInvalid, //! GCN - kGraphConvolutional - // TODO SAGE and GAT + kGraphConvolutional, + //! Dense linear xform layer + kDense + // TODO GAT }; //! Supported output layer types in the GNN @@ -39,15 +41,14 @@ struct GNNLayerDimensions { struct GNNLayerConfig { //! True if weights should be allocated bool allocate_weights{true}; - //! True if dropout is to be done at beginning of forward phase - bool do_dropout{false}; + //! Turns off dropout of weights if enabled + bool disable_dropout{false}; //! Rate at which to drop things if dropout is on float dropout_rate{0.5}; - //! True if some activation function is to be called done at end of forward - //! phase - bool do_activation{false}; - //! True if normalization is to occur during multiplies - bool do_normalization{false}; + //! True to disable activation function for intermediate layers + bool disable_activation{false}; + //! True if normalization is disabled to occur during multiplies + bool disable_normalization{false}; //! If this is false, aggregate may occur after multiply if # of input columns //! is higher than output columns to do less work in aggregation bool disable_aggregate_after_update{false}; @@ -79,7 +80,7 @@ class GNNLayer { //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } - void DisableActivation() { config_.do_activation = false; } + void DisableActivation() { config_.disable_activation = true; } //! Initializes all layer weights to 1. This is used as a debug function for //! testing. 
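
The DenseLayer declared above is the graph convolutional layer with the aggregation step removed, so its forward transform reduces to one row-major matrix multiply of the (input_rows x input_columns) features by the (input_columns x output_columns) weights; the implementation further below issues this through galois::CBlasSGEMM. A naive standalone sketch of that transform, with illustrative function and argument names:

#include <cstddef>
#include <vector>

// output = input (rows x in_cols) * weights (in_cols x out_cols), row-major.
void DenseForward(size_t rows, size_t in_cols, size_t out_cols,
                  const std::vector<float>& input,
                  const std::vector<float>& weights,
                  std::vector<float>& output) {
  output.assign(rows * out_cols, 0.0f);
  for (size_t r = 0; r < rows; ++r) {
    for (size_t k = 0; k < in_cols; ++k) {
      const float in_rk = input[r * in_cols + k];
      for (size_t c = 0; c < out_cols; ++c) {
        output[r * out_cols + c] += in_rk * weights[k * out_cols + c];
      }
    }
  }
}
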
diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 5eac909e18..8192b3f087 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -1,6 +1,7 @@ #include "galois/GNNMath.h" #include "galois/GraphNeuralNetwork.h" #include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/DenseLayer.h" #include "galois/layers/SoftmaxLayer.h" #include "galois/layers/SigmoidLayer.h" @@ -48,14 +49,22 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( layer_dims.input_columns, layer_dims.output_columns); } #endif - if (i == config_.num_intermediate_layers() - 1) { - // last layer before output layer should never have activation - gnn_layers_.back()->DisableActivation(); - } + break; + case GNNLayerType::kDense: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) dense layer gpu +#endif break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + + if (i == config_.num_intermediate_layers() - 1) { + // last layer before output layer should never have activation + gnn_layers_.back()->DisableActivation(); + } } // create the output layer @@ -117,7 +126,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { for (size_t epoch = 0; epoch < num_epochs; epoch++) { if (config_.do_sampling()) { // subgraph sample every epoch - graph_->UniformNodeSample(); + // graph_->UniformNodeSample(); + graph_->GraphSAINTSample(); graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c102fc8283..4f12dad28c 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -177,8 +177,6 @@ void galois::graphs::GNNGraph::AggregateSync( } #endif -void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } - void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { @@ -189,6 +187,74 @@ void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { // them } +// TODO(loc) does not work in a distributed setting: assumes the partitioned +// graph is the entire graph +void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, + size_t walk_depth) { + // reset sample + galois::do_all(galois::iterate(begin(), end()), + [&](size_t n) { partitioned_graph_->getData(n) = 0; }); + + galois::on_each([&](size_t thread_id, size_t num_threads) { + size_t my_start = 0; + size_t my_end = 0; + std::tie(my_start, my_end) = + galois::block_range(size_t{0}, num_roots, thread_id, num_threads); + size_t thread_roots = my_end - my_start; + size_t train_range = global_training_mask_range_.size; + // init RNG + drand48_data seed_struct; + srand48_r(sample_rng_.GetRandomNumber() * thread_id * num_threads, + &seed_struct); + + for (size_t root_num = 0; root_num < thread_roots; root_num++) { + // pick a random training node root at random (with replacement); + size_t root = 0; + while (true) { + long int rand_num; + lrand48_r(&seed_struct, &rand_num); + root = global_training_mask_range_.begin + (rand_num % train_range); + if (IsValidForPhase(root, GNNPhase::kTrain)) { + break; + } + } + // mark this root as sampled + SetSampledNode(root); + assert(IsInSampledGraph(root)); + + // sample more nodes based on depth of the walk + for (size_t 
current_depth = 0; current_depth < walk_depth; + current_depth++) { + // pick random edge, mark sampled, swap roots + EdgeIterator first_edge = EdgeBegin(root); + size_t num_edges = std::distance(first_edge, EdgeEnd(root)); + if (num_edges == 0) { + break; + } + + // must select training neighbor: if it doesn't, then ignore and + // continue + // To prevent infinite loop in case node has NO training neighbor, + // this implementation will not loop until one is found and will + // not find full depth if it doesn't find any training nodes randomly + long int rand_num; + lrand48_r(&seed_struct, &rand_num); + EdgeIterator selected_edge = first_edge + (rand_num % num_edges); + size_t candidate_dest = EdgeDestination(selected_edge); + + // TODO(loc) another possibility is to just pick it anyways regardless + // but don't mark it as sampled, though this would lead to disconnected + // graph + if (IsValidForPhase(candidate_dest, GNNPhase::kTrain)) { + SetSampledNode(candidate_dest); + assert(IsInSampledGraph(candidate_dest)); + root = candidate_dest; + } + } + } + }); +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -432,9 +498,10 @@ void galois::graphs::GNNGraph::CalculateFullNormFactor() { galois::iterate(static_cast(0), partitioned_graph_->size()), [&](size_t local_id) { // translate lid into gid to get global degree - size_t global_id = partitioned_graph_->getGID(local_id); + size_t global_id = partitioned_graph_->getGID(local_id); + // +1 because simulated self edge size_t global_degree = whole_graph_.edge_end(global_id) - - whole_graph_.edge_begin(global_id); + whole_graph_.edge_begin(global_id) + 1; // only set if non-zero if (global_degree != 0) { norm_factors_[local_id] = diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp new file mode 100644 index 0000000000..b767805a6a --- /dev/null +++ b/libgnn/src/layers/DenseLayer.cpp @@ -0,0 +1,127 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/DenseLayer.h" + +galois::DenseLayer::DenseLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, dimensions, config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + size_t num_input_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + layer_type_ = galois::GNNLayerType::kDense; + p_in_temp_1_ = PointerWithSize(in_temp_1_); + GALOIS_LOG_VERBOSE("Dense initialized"); +} + +const galois::PointerWithSize +galois::DenseLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == + (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); + assert(p_in_temp_1_.size() == input_embeddings.size()); + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { + 
DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + } + + // FW + UpdateEmbeddings(input_data, p_forward_output_matrix_.data()); + + if (!config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + Activation(); + } + + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return p_forward_output_matrix_; +} + +galois::PointerWithSize galois::DenseLayer::BackwardPhase( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { + assert(layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!config_.disable_activation) { + ActivationDerivative(input_gradient); + } + + if (layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(input_gradient->data(), + p_backward_output_matrix_.data()); + } + + // W' = F^T (FW)' + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, prev_layer_input.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); + // sync weight gradients; note aggregation sync occurs in the function call + // already + WeightGradientSyncSum(); + + if (!config_.disable_dropout && layer_number_ != 0) { + DoDropoutDerivative(); + } + + return p_backward_output_matrix_; +} + +void galois::DenseLayer::UpdateEmbeddings(const GNNFloat* node_embeddings, + GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + +void galois::DenseLayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, + GNNFloat* output) { + assert(p_layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 07f69cee6e..23c2affde7 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -54,7 +54,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); // first, dropout - if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(input_embeddings, &p_in_temp_1_); input_data = 
p_in_temp_1_.data(); } @@ -78,7 +78,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( // TODO synchronization of aggregation functions - if (config_.do_activation) { + if (!config_.disable_activation) { GALOIS_LOG_VERBOSE("Doing activation"); Activation(); } @@ -95,7 +95,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation - if (config_.do_activation) { + if (!config_.disable_activation) { ActivationDerivative(input_gradient); } @@ -180,7 +180,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // WeightGradientSyncAverage(); WeightGradientSyncSum(); - if (config_.do_dropout && layer_number_ != 0) { + if (!config_.disable_dropout && layer_number_ != 0) { DoDropoutDerivative(); } @@ -194,9 +194,9 @@ void galois::GraphConvolutionalLayer::AggregateAll( pts) { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), - column_length, node_embeddings, - aggregate_output, config_.do_normalization); + gpu_object_.AggregateAllGPU( + graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, + aggregate_output, !config_.disable_normalization); graph_.AggregateSync(aggregate_output, column_length, layer_number_); } else { #endif @@ -217,7 +217,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first - // TODO(loc) can init to self as well to add to self for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = 0; } @@ -238,10 +237,16 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } GNNFloat source_norm = 0.0; - if (config_.do_normalization) { + if (!config_.disable_normalization) { source_norm = graph_.NormFactor(src); } + // init to self + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i]; + } + // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { size_t dst = graph_.EdgeDestination(e); @@ -263,7 +268,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; - if (config_.do_normalization) { + if (!config_.disable_normalization) { GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); // scale the value on the destination by the combined norm term assert(pts->getLocal()->size() == column_length); @@ -288,14 +293,14 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // GNNFloat* intermediate = pts->getLocal()->data(); // GNNFloat norm_scale = source_norm * source_norm; // for (size_t i = 0; i < column_length; i++) { - // intermediate[i] = - // norm_scale * node_embeddings[index_to_src_feature + i]; - //} - //// add self + // intermediate[i] = + // norm_scale * node_embeddings[index_to_src_feature + i]; + // } + // // add self // galois::VectorAdd(column_length, - // &aggregate_output[index_to_src_feature], - // intermediate, - // &aggregate_output[index_to_src_feature]); + // &aggregate_output[index_to_src_feature], + // intermediate, + // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync From 781b0cb439da41e5578bbad6d4315c0c2f7a7577 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 14:38:40 -0600 Subject: [PATCH 467/660] GALOIS_LOG_VASSERT Steals VASSERT from Katana 
so that assertions can be easier to understand. --- libsupport/include/galois/Logging.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libsupport/include/galois/Logging.h b/libsupport/include/galois/Logging.h index 8621233fdf..674a9b65fa 100644 --- a/libsupport/include/galois/Logging.h +++ b/libsupport/include/galois/Logging.h @@ -111,4 +111,13 @@ void LogLine(LogLevel level, const char* file_name, int line_no, F fmt_string, } \ } while (0) +#define GALOIS_LOG_VASSERT(cond, fmt_string, ...) \ + do { \ + if (!(cond)) { \ + ::galois::LogLine(::galois::LogLevel::Error, __FILE__, __LINE__, \ + FMT_STRING(fmt_string), ##__VA_ARGS__); \ + ::std::abort(); \ + } \ + } while (0) + #endif From e0a6f8cc1842b1d2f91e8812f16f6bbe718f1f93 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 14:42:13 -0600 Subject: [PATCH 468/660] gcn-app allows specifying layers; var changes CLI option to specify layer types for gcn-dist (which should now be called gnn-dist probably). Changes variable names to reflect libgnn var name changes. --- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 2 +- lonestar/libgnnbench/include/GNNBench/Input.h | 3 +- lonestar/libgnnbench/src/Input.cpp | 84 ++++++++++++------- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index a7eb0a4bae..65fe1338cc 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -9,7 +9,7 @@ int main(int argc, char* argv[]) { galois::StatTimer init_timer("InitializationTime"); init_timer.start(); std::unique_ptr gnn = - InitializeGraphNeuralNetwork(galois::GNNLayerType::kGraphConvolutional); + InitializeGraphNeuralNetwork(); gnn->SetLayerPhases(galois::GNNPhase::kTrain); init_timer.stop(); diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index dc62b19d50..bb417a90f2 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -19,5 +19,4 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); //! Using command line args above, create a GNN using some specified layer type //! as the intermediate layer.
-std::unique_ptr -InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type); +std::unique_ptr InitializeGraphNeuralNetwork(); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index dea458a6b3..3e602f8f74 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -30,16 +30,29 @@ llvm::cl::opt num_layers( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list - layer_sizes("layerSizes", - cll::desc("Comma separated list of numbers specifying " - "intermediate layer sizes (does not include output)"), - cll::CommaSeparated); +llvm::cl::list layer_sizes( + "layerSizes", + cll::desc( + "Comma separated list of numbers specifying " + "intermediate layer sizes (does not include output); default sizes are " + "16 until last layer which is the size of the # of labels"), + cll::CommaSeparated); + +llvm::cl::list cl_layer_types( + "layerTypes", + cll::desc("Comma separated list of layer types specifying " + "intermediate layers (does not include output)"), + cll::values(clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", + "Graph Convolutional Layer (default)"), + clEnumValN(galois::GNNLayerType::kDense, "dense", + "Dense Layer")), + cll::CommaSeparated); -llvm::cl::opt do_dropout( - "doDropout", - cll::desc("If true (on by default), does dropout of input during training"), - cll::init(true)); +llvm::cl::opt + disable_dropout("disableDropout", + cll::desc("If true (off by default), disables dropout of " + "layer weights during training"), + cll::init(false)); llvm::cl::opt dropout_rate( "dropoutRate", @@ -47,17 +60,17 @@ llvm::cl::opt dropout_rate( "0.1, then 10 percent chance of dropping) (default 0.5)"), cll::init(0.5)); -llvm::cl::opt - do_activation("doActivation", - cll::desc("If true (off by default), does activation at the " - "end of an intermediate layer"), - cll::init(false)); +llvm::cl::opt disable_activation( + "disableActivation", + cll::desc("If true (off by default), disable activation at the " + "end of an intermediate layers"), + cll::init(false)); -llvm::cl::opt - do_normalization("doNormalization", - cll::desc("If true (on by default), normalizes vertex " - "features based on their degree"), - cll::init(true)); +llvm::cl::opt disable_normalization( + "disableNormalization", + cll::desc("If true (off by default), disable normalizing vertex " + "features based on their degree"), + cll::init(false)); llvm::cl::opt output_layer_type( "outputLayer", cll::desc("Type of output layer"), @@ -103,6 +116,25 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { } } +//! Initializes the vector of layer sizes from command line args + graph +std::vector CreateLayerTypesVector() { + std::vector layer_types; + if (!cl_layer_types.size()) { + // default is all GCN layers + for (size_t i = 0; i < num_layers; i++) { + layer_types.emplace_back(galois::GNNLayerType::kGraphConvolutional); + } + } else { + GALOIS_LOG_VASSERT(cl_layer_types.size() == num_layers, + "Number layer types should be {} not {}", num_layers, + cl_layer_types.size()); + for (size_t i = 0; i < num_layers; i++) { + layer_types.emplace_back(cl_layer_types[i]); + } + } + return layer_types; +} + //! Initializes the vector of layer sizes from command line args + graph std::vector CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { @@ -139,10 +171,10 @@ CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { //! 
Setup layer config struct based on cli args galois::GNNLayerConfig CreateLayerConfig() { galois::GNNLayerConfig layer_config; - layer_config.do_dropout = do_dropout; + layer_config.disable_dropout = disable_dropout; layer_config.dropout_rate = dropout_rate; - layer_config.do_activation = do_activation; - layer_config.do_normalization = do_normalization; + layer_config.disable_activation = disable_activation; + layer_config.disable_normalization = disable_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; layer_config.inductive_training_ = do_inductive_training; return layer_config; } @@ -184,17 +216,13 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { return std::make_unique(opt_sizes, num_layers); } -std::unique_ptr -InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { +std::unique_ptr InitializeGraphNeuralNetwork() { // partition/load graph auto gnn_graph = std::make_unique( input_directory, input_name, partition_scheme, !multiclass_labels); // create layer types vector - std::vector layer_types; - for (size_t i = 0; i < num_layers; i++) { - layer_types.push_back(layer_type); - } + std::vector layer_types = CreateLayerTypesVector(); // sizes std::vector layer_sizes_vector = CreateLayerSizesVector(gnn_graph.get()); From 3112b9e4ec40f228c50759ce0bb5f700a6de8702 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 15:47:32 -0600 Subject: [PATCH 469/660] Fix libgnn tests; disable self aggregation option Tests broke after changing variable names and adding self aggregation in a previous commit. This commit adds a config option for self aggregation and fixes tests. --- libgnn/include/galois/layers/GNNLayer.h | 10 ++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 8 +++++--- libgnn/test/convlayer-test.cpp | 13 ++++++++----- libgnn/test/epoch-test.cpp | 6 +++--- libgnn/test/gnnfb-test.cpp | 1 + libgnn/test/multilabel-epoch-test.cpp | 6 +++--- libgnn/test/sample-test.cpp | 1 + lonestar/libgnnbench/src/Input.cpp | 6 ++++++ 8 files changed, 37 insertions(+), 14 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e387441b8f..68d2107456 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -52,11 +52,21 @@ struct GNNLayerConfig { //! If this is false, aggregate may occur after multiply if # of input columns //! is higher than output columns to do less work in aggregation bool disable_aggregate_after_update{false}; + //! If on, do not aggregate the self vector during aggregation + bool disable_self_aggregate{false}; //! Graph sampling flag in use or not bool do_sampling{false}; //! Inductive layer means for aggregation all non-training nodes are ignored bool inductive_training_{false}; // TODO activation type; for now default is softmax + + //!
Sets settings such that testing is easy + void DebugConfig() { + disable_activation = true; + disable_normalization = true; + disable_dropout = true; + disable_self_aggregate = true; + } }; // Tried to avoid inheritance, but keeping track of heterogeneous layers diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 23c2affde7..1b2778fe6f 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -242,9 +242,11 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } // init to self - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i]; + if (!config_.disable_self_aggregate) { + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i]; + } } // loop through all destinations to grab the feature to aggregate diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 136953378d..bcada6c4ed 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -53,6 +53,7 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); // create the layer, no norm factor std::unique_ptr layer_0 = @@ -69,9 +70,11 @@ int main() { // since norm factors aren't invovled it is possible to do full assertions // 7 x 2 GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); - GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + GALOIS_LOG_VASSERT(layer_0_forward_output[0] == 3, "{} should be 3", + layer_0_forward_output[0]); GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); - GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_VASSERT(layer_0_forward_output[2] == 6, "{} should be 6", + layer_0_forward_output[2]); GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); @@ -207,9 +210,9 @@ int main() { ////////////////////////////////////////////////////////////////////////////// galois::GNNLayerConfig config; - config.do_dropout = true; - config.do_activation = true; - config.do_normalization = true; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; config.disable_aggregate_after_update = false; // finally, just make sure dropout and activation run without crashes diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index d8a27cc13b..2dbaea3372 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -22,9 +22,9 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = true; - layer_config.do_activation = false; - layer_config.do_normalization = true; + layer_config.disable_dropout = false; + layer_config.disable_activation = false; + layer_config.disable_normalization = false; // XXX Activation kills accuracy compared to old code, esp. 
for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 224204bceb..091c6f01c8 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -25,6 +25,7 @@ int main() { std::vector layer_output_sizes = {4, 7, 7}; galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); // note GNNLayerConfig is passed in; use a config that does not do anything // extra like dropout or activation and the like so that input is easier to // verify diff --git a/libgnn/test/multilabel-epoch-test.cpp b/libgnn/test/multilabel-epoch-test.cpp index 3fb96f8c81..7626abda1d 100644 --- a/libgnn/test/multilabel-epoch-test.cpp +++ b/libgnn/test/multilabel-epoch-test.cpp @@ -22,9 +22,9 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = true; - layer_config.do_activation = false; - layer_config.do_normalization = true; + layer_config.disable_dropout = false; + layer_config.disable_activation = false; + layer_config.disable_normalization = false; // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSigmoid, diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index ead938e5aa..063ff80ca5 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -27,6 +27,7 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); // choose a few sample nodes test_graph.SetSampledNode(0); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 3e602f8f74..47ca1bfe0c 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -92,6 +92,11 @@ llvm::cl::opt disable_agg_after_update( "after update optimization"), cll::init(false)); +llvm::cl::opt disable_self_aggregate( + "disableSelfAggregation", + cll::desc("If true (off by default), disables aggregate of self feature"), + cll::init(false)); + llvm::cl::opt do_graph_sampling("doGraphSampling", cll::desc("If true (off by default), sample nodes for " @@ -176,6 +181,7 @@ galois::GNNLayerConfig CreateLayerConfig() { layer_config.disable_activation = disable_activation; layer_config.disable_normalization = disable_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; + layer_config.disable_self_aggregate = disable_self_aggregate; layer_config.inductive_training_ = do_inductive_training; return layer_config; } From 6607147502f5c05e24d374695d2481cbf15089b5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 17 Feb 2021 17:48:24 -0600 Subject: [PATCH 470/660] Partial SAGE layer + self loop norm fix in GCN Adds incomplete SAGE layer implementation: the mean aggregation (done via the norms being 1 / degree) is done in the SAGE layer. What needs to be done next is the concat part of it. (next commit will take care of this) Also fixes a minor thing in GCN self-loop aggregation where norm^2 needs to be applied to the self feature. 
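For reference, a minimal standalone sketch of the two normalization schemes described above, assuming a toy CSR graph (the Aggregate helper, the row_start/dests arrays, and the flat feature layout are made up for this note and are not the libgnn code). GCN scales a neighbor feature by 1 / sqrt(deg(src) * deg(dst)) and, when self aggregation is on, the self feature by the squared source norm 1 / deg(src); the SAGE-style mean aggregation instead scales the plain neighbor sum by 1 / deg(src).

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Toy CSR graph: row_start has num_nodes + 1 entries, dests holds edge
// destinations, feats is row-major (num_nodes x dim). Assumes a symmetric
// graph so every edge destination has nonzero degree.
std::vector<float> Aggregate(const std::vector<std::size_t>& row_start,
                             const std::vector<std::size_t>& dests,
                             const std::vector<float>& feats, std::size_t dim,
                             bool gcn_norm) {
  assert(!row_start.empty());
  std::size_t num_nodes = row_start.size() - 1;
  assert(feats.size() == num_nodes * dim);
  std::vector<float> out(num_nodes * dim, 0.0f);
  for (std::size_t src = 0; src < num_nodes; ++src) {
    float d_src = static_cast<float>(row_start[src + 1] - row_start[src]);
    if (d_src == 0.0f) {
      continue; // isolated node: nothing to aggregate
    }
    if (gcn_norm) {
      // self feature scaled by source_norm^2 = 1 / deg(src) (the fix above)
      for (std::size_t k = 0; k < dim; ++k) {
        out[src * dim + k] += feats[src * dim + k] / d_src;
      }
    }
    for (std::size_t e = row_start[src]; e < row_start[src + 1]; ++e) {
      std::size_t dst = dests[e];
      float d_dst = static_cast<float>(row_start[dst + 1] - row_start[dst]);
      // GCN: symmetric sqrt norm; SAGE: mean over neighbors, i.e. 1 / deg(src)
      float scale = gcn_norm ? 1.0f / std::sqrt(d_src * d_dst) : 1.0f / d_src;
      for (std::size_t k = 0; k < dim; ++k) {
        out[src * dim + k] += scale * feats[dst * dim + k];
      }
    }
  }
  return out;
}

The concat step mentioned above then combines this aggregate with the untouched self features through a second weight matrix (conceptually output = self * W1 + aggregate * W2) rather than folding the self feature into the neighbor sum.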
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/graphs/GNNGraph.h | 6 + libgnn/include/galois/layers/GNNLayer.h | 2 + libgnn/include/galois/layers/SAGELayer.h | 99 +++++ libgnn/src/GraphNeuralNetwork.cpp | 8 + libgnn/src/graphs/GNNGraph.cpp | 3 + libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 +- libgnn/src/layers/SAGELayer.cpp | 352 ++++++++++++++++++ lonestar/libgnnbench/src/Input.cpp | 2 + 9 files changed, 475 insertions(+), 1 deletion(-) create mode 100644 libgnn/include/galois/layers/SAGELayer.h create mode 100644 libgnn/src/layers/SAGELayer.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 83fcc327cf..82454b1301 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -9,6 +9,7 @@ set(sources src/layers/DenseLayer.cpp src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp + src/layers/SAGELayer.cpp ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3978661a54..4dafda2afb 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -95,6 +95,9 @@ class GNNGraph { return partitioned_graph_->getEdgeDst(ei); }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + //! Degree norm (1 / degree) of current functional graph (e.g., sampled, + //! inductive graph, etc); calculated whenever norm factor is calculated + GNNFloat DegreeNorm(GraphNode n) const { return degree_norm_[n]; } // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, @@ -287,6 +290,9 @@ class GNNGraph { //! Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; + //! Normalization constant based on degrees (unlike nomral norm factors + //! it's only division without a square root) + std::vector degree_norm_; //! RNG for subgraph sampling galois::PerThreadRNG sample_rng_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 68d2107456..3be7908ad7 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -17,6 +17,8 @@ enum class GNNLayerType { kInvalid, //! GCN kGraphConvolutional, + //! Sage layer: same as GCN except with mean aggregation and concat + kSAGE, //! Dense linear xform layer kDense // TODO GAT diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h new file mode 100644 index 0000000000..a3fd5ecac6 --- /dev/null +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -0,0 +1,99 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +#ifdef GALOIS_ENABLE_GPU +// TODO(loc/hochan) +#endif + +namespace galois { + +struct SAGELayerConfig { + // TODO(loc) relevant options here + bool todo; +}; + +//! Same as GCN layer except for the following: +//! - Mean aggregation; no symmetric norm with sqrts used (this +//! ends up performing better for some graphs) +//! - Concatination of the self: rather than aggregating self +//! feature it is concatinated (i.e. dimensions are doubled) +class SAGELayer : public GNNLayer { +public: + //! Initializes the variables of the base class and also allocates additional + //! memory for temporary matrices. Also initializes sync substrate for the + //! 
weight matrix + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, + const SAGELayerConfig& sage_config); + + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : SAGELayer(layer_num, graph, dimensions, config, SAGELayerConfig()) {} + + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : SAGELayer(layer_num, graph, dimensions, GNNLayerConfig(), + SAGELayerConfig()) {} + + // Parent functions + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; + +private: + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + std::vector in_temp_2_; + // Temporary matrix the size of the output of the forward pass; used if + // an intermediate op occurs before writing to the final output matrix + std::vector out_temp_; + + // Pointer with size versions + PointerWithSize p_in_temp_1_; + PointerWithSize p_in_temp_2_; + PointerWithSize p_out_temp_; + + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. + galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + + //! CPU aggregation + void AggregateAllCPU( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward); + + //! Performs aggregation for all nodes of the graph given the length of the + //! vector to aggregate, the features themselves, an output array, and per + //! thread storage for the intermediate scaling via norm factor + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts); + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward); + + //! Do embedding update via mxm with this layer's weights (forward) + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! 
Calculate graident via mxm with last layer's gradients (backward) + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) + GCNGPUAllocations gpu_object_; +#endif +}; + +} // namespace galois diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 8192b3f087..40e3c8a7e1 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -4,6 +4,7 @@ #include "galois/layers/DenseLayer.h" #include "galois/layers/SoftmaxLayer.h" #include "galois/layers/SigmoidLayer.h" +#include "galois/layers/SAGELayer.h" galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, @@ -48,6 +49,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( i, galois::runtime::getSystemNetworkInterface().Num, layer_dims.input_columns, layer_dims.output_columns); } +#endif + break; + case GNNLayerType::kSAGE: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) sage layer gpu #endif break; case GNNLayerType::kDense: diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4f12dad28c..af3ef00baf 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -487,6 +487,7 @@ void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { void galois::graphs::GNNGraph::InitNormFactor() { GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); norm_factors_.resize(partitioned_graph_->size(), 0.0); + degree_norm_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); } @@ -506,6 +507,7 @@ void galois::graphs::GNNGraph::CalculateFullNormFactor() { if (global_degree != 0) { norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(global_degree)); + degree_norm_[local_id] = 1.0 / static_cast(global_degree); } }, galois::loopname("CalculateFullNormFactor")); @@ -569,6 +571,7 @@ void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, // only set if non-zero if (degree != 0) { norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(degree)); + degree_norm_[local_id] = 1.0 / static_cast(degree); } }, galois::loopname("CalculateSpecialNormFactor")); diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 1b2778fe6f..81bebfd8e2 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -245,7 +245,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( if (!config_.disable_self_aggregate) { for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i]; + node_embeddings[index_to_src_feature + i] * source_norm * + source_norm; } } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp new file mode 100644 index 0000000000..a7e6d4c5f6 --- /dev/null +++ b/libgnn/src/layers/SAGELayer.cpp @@ -0,0 +1,352 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SAGELayer.h" + +galois::SAGELayer::SAGELayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config, + const SAGELayerConfig&) + : GNNLayer(layer_num, graph, dimensions, config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + size_t num_input_elements = + 
layer_dimensions_.input_rows * layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + // TODO temp2 does not need to be initialized in all circumstances + in_temp_2_.resize(num_input_elements, 0); + + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + out_temp_.resize(num_output_elements, 0); + layer_type_ = galois::GNNLayerType::kGraphConvolutional; +#ifdef GALOIS_ENABLE_GPU + // TODO + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.Allocate(num_input_elements, num_output_elements); + // init pointers with size + p_in_temp_1_ = + PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + p_in_temp_2_ = + PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + p_out_temp_ = + PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + } else { +#endif + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + // TODO concat + } +#endif + + GALOIS_LOG_VERBOSE("SAGE layer initialized"); +} + +const galois::PointerWithSize galois::SAGELayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == + (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); + assert(p_in_temp_1_.size() == input_embeddings.size()); + assert(p_in_temp_2_.size() == input_embeddings.size()); + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { + DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + } + + // flip aggregate/update if dimensions favor it (do less work) + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + // aggregation and update + AggregateAll(layer_dimensions_.input_columns, input_data, + p_in_temp_2_.data(), &input_column_intermediates_); + UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); + } else { + // update to aggregate + // FW + UpdateEmbeddings(input_data, p_out_temp_.data()); + // A(FW) + AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), + p_forward_output_matrix_.data(), + &output_column_intermediates_); + } + + // TODO synchronization of aggregation functions + + if (!config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + Activation(); + } + + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return p_forward_output_matrix_; +} + +galois::PointerWithSize galois::SAGELayer::BackwardPhase( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { + assert(layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!config_.disable_activation) { + ActivationDerivative(input_gradient); + } + + // AFW = O + + // derivative of aggregation/update + // TODO clean up logic here to reduce nesting + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + 
assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + assert(p_in_temp_1_.size() == + layer_dimensions_.input_columns * layer_dimensions_.input_rows); + // pintemp1 contains (AF)' + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + // TODO if training A, then A' compute here if layer # is 0 + // dot product of edges that exist in A + } + // weight gradient calculation + // TODO(loc) put this in a function to put the ifdef in there +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, p_in_temp_2_.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); + } else { +#endif + // temp 2 holds aggregated feature vectors from forward phase + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + p_in_temp_2_.data(), input_gradient->data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } else { + // TODO at this point, out_temp contains memoized FW + // can use it to get A' = O' (FW)^T + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), + p_out_temp_.data(), &output_column_intermediates_, true); + if (layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); + } + // TODO put this in a function + // W' = F^T (FW)' +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); + } else { +#endif + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), p_out_temp_.data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + // sync weight gradients; note aggregation sync occurs in the function call + // already + // TODO figure out how to do this with GPUs + // WeightGradientSyncAverage(); + WeightGradientSyncSum(); + + if (!config_.disable_dropout && layer_number_ != 0) { + DoDropoutDerivative(); + } + + return p_backward_output_matrix_; +} + +void galois::SAGELayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts) { + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); +} + +void galois::SAGELayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts, + bool is_backward) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AggregateAllGPU( + 
graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, + aggregate_output, !config_.disable_normalization); + graph_.AggregateSync(aggregate_output, column_length, layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, + is_backward); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + +void galois::SAGELayer::AggregateAllCPU( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward) { + size_t num_nodes = graph_.size(); + + galois::do_all( + galois::iterate(static_cast(0), num_nodes), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // check if node is part of sampled graph; ignore after 0'ing if not + // sampled + if (!graph_.IsInSampledGraph(src)) + return; + } + } + + GNNFloat source_norm = 0.0; + if (!config_.disable_normalization) { + source_norm = graph_.DegreeNorm(src); + } + + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + size_t dst = graph_.EdgeDestination(e); + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } + } + + size_t index_to_dst_feature = dst * column_length; + + if (!config_.disable_normalization) { + GNNFloat norm_scale; + if (!is_backward) { + norm_scale = source_norm; + } else { + norm_scale = graph_.DegreeNorm(dst); + } + + // scale the value on the destination by the combined norm term + assert(pts->getLocal()->size() == column_length); + GNNFloat* intermediate = pts->getLocal()->data(); + for (size_t i = 0; i < column_length; i++) { + intermediate[i] = + norm_scale * node_embeddings[index_to_dst_feature + i]; + } + // add intermediate instead of original feature + galois::VectorAdd( + column_length, &aggregate_output[index_to_src_feature], + intermediate, &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + }, + galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + // aggregate sync + graph_.AggregateSync(aggregate_output, column_length); +} + +void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, + GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, 
node_embeddings, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + +void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, + GNNFloat* output) { + assert(p_layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 47ca1bfe0c..c48f0b41b4 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -44,6 +44,8 @@ llvm::cl::list cl_layer_types( "intermediate layers (does not include output)"), cll::values(clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", "Graph Convolutional Layer (default)"), + clEnumValN(galois::GNNLayerType::kSAGE, "sage", + "SAGE layer (GCN with concat + mean)"), clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense Layer")), cll::CommaSeparated); From a0a14278f6b4e21868e49d5f978ab1a77da8f0ec Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 18 Feb 2021 15:35:49 -0600 Subject: [PATCH 471/660] SAGE layer: forward phase Implement the forward concat phase for the SAGE layer; maintains another weight matrix solely for multiplying with the self features. Conceptually W = W1 | W2: since this is the case, you can multiply the self matrix separately from the aggregated one and just sum it into the aggregated linear xform. This code has been unit tested; the test will be pushed in later once full layer is done. Next up is backward phase. --- libgnn/include/galois/layers/SAGELayer.h | 66 +++++++++++++++-------- libgnn/src/layers/SAGELayer.cpp | 69 ++++++++++++++++++++---- 2 files changed, 102 insertions(+), 33 deletions(-) diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index a3fd5ecac6..9dcd53d9c6 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -8,10 +8,12 @@ namespace galois { struct SAGELayerConfig { - // TODO(loc) relevant options here - bool todo; + bool disable_concat{false}; }; +// TODO(loc) move common functionality with GCN layer to common parent class +// (e.g. inits): cleans up Dense code a bit as well + //! Same as GCN layer except for the following: //! - Mean aggregation; no symmetric norm with sqrts used (this //! 
ends up performing better for some graphs) @@ -35,6 +37,12 @@ class SAGELayer : public GNNLayer { : SAGELayer(layer_num, graph, dimensions, GNNLayerConfig(), SAGELayerConfig()) {} + void InitSelfWeightsTo1() { + if (layer_weights_2_.size()) { + layer_weights_2_.assign(layer_weights_2_.size(), 1); + } + } + // Parent functions const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; @@ -44,27 +52,6 @@ class SAGELayer : public GNNLayer { PointerWithSize* input_gradient) final; private: - // 2 temporaries the size of the forward input; used for dropout and - // aggregation (if either are required) - std::vector in_temp_1_; - std::vector in_temp_2_; - // Temporary matrix the size of the output of the forward pass; used if - // an intermediate op occurs before writing to the final output matrix - std::vector out_temp_; - - // Pointer with size versions - PointerWithSize p_in_temp_1_; - PointerWithSize p_in_temp_2_; - PointerWithSize p_out_temp_; - - // Each thread has a vector of size # input columns or # output columns for - // storing intermediate results during aggregation. - // The one used depeneds on if aggregation occurs before or after the mxm. - galois::substrate::PerThreadStorage> - input_column_intermediates_; - galois::substrate::PerThreadStorage> - output_column_intermediates_; - //! CPU aggregation void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, @@ -87,9 +74,42 @@ class SAGELayer : public GNNLayer { //! Do embedding update via mxm with this layer's weights (forward) void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! Same as above but uses the second set of weights (self feature weights) + void SelfFeatureUpdateEmbeddings(const GNNFloat* node_embeddings, + GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + //! SAGE config params + SAGELayerConfig sage_config_; + + // second set of weights for the concat that may occur + std::vector layer_weights_2_; + std::vector layer_weight_gradients_2_; + PointerWithSize p_layer_weights_2_; + PointerWithSize p_layer_weight_gradients_2_; + + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + std::vector in_temp_2_; + // Temporary matrix the size of the output of the forward pass; used if + // an intermediate op occurs before writing to the final output matrix + std::vector out_temp_; + + // Pointer with size versions + PointerWithSize p_in_temp_1_; + PointerWithSize p_in_temp_2_; + PointerWithSize p_out_temp_; + + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. 
+ galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) GCNGPUAllocations gpu_object_; diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index a7e6d4c5f6..461f23bd99 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -6,10 +6,24 @@ galois::SAGELayer::SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, - const SAGELayerConfig&) - : GNNLayer(layer_num, graph, dimensions, config), + const SAGELayerConfig& sage_config) + : GNNLayer(layer_num, graph, dimensions, config), sage_config_(sage_config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { + if (!sage_config_.disable_concat) { + // there are now 2 weight matrices used: one for self, one for aggregation + // abstractly it's one matrix: W = W1 | W2 + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + layer_weights_2_.resize(num_weight_elements); + layer_weight_gradients_2_.resize(num_weight_elements, 0); + GlorotBengioInit(&layer_weights_2_); + // update the pointers to them as well as realloc will require it + p_layer_weights_2_ = PointerWithSize(layer_weights_2_); + p_layer_weight_gradients_2_ = + PointerWithSize(layer_weight_gradients_2_); + } + size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); @@ -20,9 +34,9 @@ galois::SAGELayer::SAGELayer(size_t layer_num, layer_dimensions_.input_rows * layer_dimensions_.output_columns; GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); out_temp_.resize(num_output_elements, 0); - layer_type_ = galois::GNNLayerType::kGraphConvolutional; + layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU - // TODO + // TODO(loc/hochan) GPU SAGE if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.Allocate(num_input_elements, num_output_elements); // init pointers with size @@ -38,13 +52,26 @@ galois::SAGELayer::SAGELayer(size_t layer_num, p_in_temp_2_ = PointerWithSize(in_temp_2_); p_out_temp_ = PointerWithSize(out_temp_); #ifdef GALOIS_ENABLE_GPU - // TODO concat + // TODO concat parameters } #endif GALOIS_LOG_VERBOSE("SAGE layer initialized"); } +void MatrixAdd(size_t num_nodes, galois::PointerWithSize in, + galois::PointerWithSize* out) { + assert(in.size() == out->size()); + assert((in.size() % num_nodes) == 0); + size_t column_size = in.size() / num_nodes; + // split matrix to threads + galois::do_all(galois::iterate(size_t{0}, num_nodes), [&](size_t node) { + size_t my_offset = node * column_size; + galois::VectorAdd(column_size, &(in[my_offset]), + &((out->data())[my_offset]), &(out->data()[my_offset])); + }); +} + const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { GALOIS_LOG_VERBOSE("Calling forward phase"); @@ -62,6 +89,9 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( input_data = p_in_temp_1_.data(); } + // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part + // which is done regardless + // flip aggregate/update if dimensions favor it (do less work) if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { @@ -79,7 +109,14 @@ const 
galois::PointerWithSize galois::SAGELayer::ForwardPhase( &output_column_intermediates_); } - // TODO synchronization of aggregation functions + if (!sage_config_.disable_concat) { + // FW1 is unaffected by the agg/update flip, so can to it + // separately + SelfFeatureUpdateEmbeddings(input_data, p_out_temp_.data()); + // add result to the output matrix: FW1 + AFW2 + MatrixAdd(layer_dimensions_.input_rows, p_out_temp_, + &p_forward_output_matrix_); + } if (!config_.disable_activation) { GALOIS_LOG_VERBOSE("Doing activation"); @@ -176,10 +213,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( #endif } - // sync weight gradients; note aggregation sync occurs in the function call - // already - // TODO figure out how to do this with GPUs - // WeightGradientSyncAverage(); WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { @@ -311,6 +344,7 @@ void galois::SAGELayer::AggregateAllCPU( void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { #ifdef GALOIS_ENABLE_GPU + // TODO self change if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.UpdateEmbeddingsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, @@ -328,6 +362,21 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, #endif } +void galois::SAGELayer::SelfFeatureUpdateEmbeddings( + const GNNFloat* node_embeddings, GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + // TODO self change +#endif + // note use of layer weights 2 differentiates this from above + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_2_.data(), output); +#ifdef GALOIS_ENABLE_GPU +} +#endif +} + void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { assert(p_layer_weights_.size() == From 0b43a3a7165184c8265a5e4f97d31882ad85c8a6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 19 Feb 2021 12:41:57 -0600 Subject: [PATCH 472/660] train/val/test splits for non-complete masks Code before did not support masks that were not complete (i.e., assumes that range given in file is complete meaning if in range it is part of that set). Code now checks to make sure this is the case before doing that; if not, it uses the mask instead (slower, but correct). --- libgnn/include/galois/graphs/GNNGraph.h | 21 ++++++++++++- libgnn/src/graphs/GNNGraph.cpp | 41 ++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 4dafda2afb..b69eb43ea2 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -136,7 +136,13 @@ class GNNGraph { //! lid in question is valid for the current phase (i.e., it is part of //! a training, validation, or test phase mask) bool IsValidForPhase(const unsigned lid, - const galois::GNNPhase current_phase) const; + const galois::GNNPhase current_phase) const { + if (!incomplete_masks_) { + return IsValidForPhaseCompleteRange(lid, current_phase); + } else { + return IsValidForPhaseMasked(lid, current_phase); + } + } ////////////////////////////////////////////////////////////////////////////// @@ -228,6 +234,15 @@ class GNNGraph { //! degree access void InitNormFactor(); + //! Used if ranges for a mask are complete (if in range, it's part of mask). 
+ bool IsValidForPhaseCompleteRange(const unsigned lid, + const galois::GNNPhase current_phase) const; + + //! Used if ranges for a mask are incomplete, meaning I actually have to + //! check the mask. + bool IsValidForPhaseMasked(const unsigned lid, + const galois::GNNPhase current_phase) const; + ////////////////////////////////////////////////////////////////////////////// // Accuracy ////////////////////////////////////////////////////////////////////////////// @@ -288,6 +303,10 @@ class GNNGraph { //! in this class GNNRange global_testing_mask_range_; + //! If true, then node splits of train/val/test aren't complete (i.e. + //! falling in range != part of that set) + bool incomplete_masks_{false}; + //! Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; //! Normalization constant based on degrees (unlike nomral norm factors diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index af3ef00baf..919d7340e4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -100,8 +100,9 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, #endif } -bool galois::graphs::GNNGraph::IsValidForPhase( +bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( const unsigned lid, const galois::GNNPhase current_phase) const { + // only use ranges if they're complete // convert to gid first size_t gid = partitioned_graph_->getGID(lid); @@ -123,9 +124,9 @@ bool galois::graphs::GNNGraph::IsValidForPhase( } // if within range, it is valid - // TODO there is an assumption here that ranges are contiguous; may not - // necessarily be the case in all inputs in which case using the mask is safer - // (but less cache efficient) + // there is an assumption here that ranges are contiguous; may not + // necessarily be the case in all inputs in which case using the mask is + // required (but less cache efficient) if (range_to_use->begin <= gid && gid < range_to_use->end) { return true; } else { @@ -133,6 +134,28 @@ bool galois::graphs::GNNGraph::IsValidForPhase( } } +bool galois::graphs::GNNGraph::IsValidForPhaseMasked( + const unsigned lid, const galois::GNNPhase current_phase) const { + // select mask to use based on phase + const std::vector* mask_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + mask_to_use = &local_training_mask_; + break; + case GNNPhase::kValidate: + mask_to_use = &local_validation_mask_; + break; + case GNNPhase::kTest: + mask_to_use = &local_testing_mask_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + mask_to_use = nullptr; + } + + return (*mask_to_use)[lid]; +} + void galois::graphs::GNNGraph::AggregateSync( GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { // set globals for the sync substrate @@ -425,6 +448,16 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( } mask_stream.close(); + if (local_sample_count != mask_range->size) { + // overlapping masks: need to actually check the masks rather than use + // ranges + if (!incomplete_masks_) { + galois::gInfo( + "Masks are not contained in range: must actually check mask"); + } + incomplete_masks_ = true; + } + return local_sample_count; } From c80ede803cac9384d642e3db9b19ecda072940ac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 19 Feb 2021 13:16:40 -0600 Subject: [PATCH 473/660] GNNLayer: optimize function made virtual Made optimize layer function able to be overridden by child classes because they may be training more than just the single weight matrix (e.g., sage 
trains 2 matrices). --- libgnn/include/galois/layers/GNNLayer.h | 6 +++++- libgnn/src/layers/GNNLayer.cpp | 6 ------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 3be7908ad7..7b8737e204 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -152,7 +152,11 @@ class GNNLayer { //! Given an optimizer, update the weights in this layer based on gradients //! stored in the layer - void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + virtual void OptimizeLayer(BaseOptimizer* optimizer, + size_t trainable_layer_number) { + optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, + trainable_layer_number); + } //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 9da77a004f..6deab8e682 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -202,12 +202,6 @@ void galois::GNNLayer::ActivationDerivative( galois::loopname("ReLU-Derivative")); } -void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, - size_t trainable_layer_number) { - optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, - trainable_layer_number); -} - void galois::GNNLayer::WeightGradientSyncSum() { // XXX bitset gradient_sync_substrate_->sync( From bd25d8d8d2c0e9bfd00737640f35bfff2aaf8465 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 22 Feb 2021 17:15:57 -0600 Subject: [PATCH 474/660] L2 normalization layer + test Adds an implementation of an l2 normalization layer (taken from GraphSAINT, but I verified it manually via pen/paper derivation). A test is added to ensure the math being done per row is correct as well. --- libgnn/CMakeLists.txt | 5 +- libgnn/include/galois/layers/GNNLayer.h | 5 +- libgnn/include/galois/layers/L2NormLayer.h | 50 +++++++++ libgnn/src/layers/L2NormLayer.cpp | 121 +++++++++++++++++++++ libgnn/test/CMakeLists.txt | 6 +- libgnn/test/l2norm-layer-test.cpp | 85 +++++++++++++++ 6 files changed, 268 insertions(+), 4 deletions(-) create mode 100644 libgnn/include/galois/layers/L2NormLayer.h create mode 100644 libgnn/src/layers/L2NormLayer.cpp create mode 100644 libgnn/test/l2norm-layer-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 82454b1301..b59cccef93 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -3,13 +3,14 @@ set(sources src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp src/graphs/GNNGraph.cpp + src/layers/DenseLayer.cpp src/layers/GNNLayer.cpp src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp - src/layers/DenseLayer.cpp + src/layers/L2NormLayer.cpp + src/layers/SAGELayer.cpp src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp - src/layers/SAGELayer.cpp ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 7b8737e204..0039683ad4 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -20,7 +20,9 @@ enum class GNNLayerType { //! Sage layer: same as GCN except with mean aggregation and concat kSAGE, //! Dense linear xform layer - kDense + kDense, + //! 
L2 normalization layer + kL2Norm // TODO GAT }; @@ -129,6 +131,7 @@ class GNNLayer { galois::GNNOutputLayerType output_layer_type() const { return output_layer_type_; } + size_t layer_number() const { return layer_number_; } //! Conducts the forward phase given the input to this layer which //! ultimately leads to an output (classfication of node labels) at the end diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h new file mode 100644 index 0000000000..176c88700e --- /dev/null +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -0,0 +1,50 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +#ifdef GALOIS_ENABLE_GPU +// TODO(loc/hochan) +#endif + +namespace galois { + +//! Applies L2 norm to rows of the input +class L2NormLayer : public GNNLayer { +public: + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : L2NormLayer(layer_num, graph, dimensions, + GNNLayerConfig{.allocate_weights = false}) {} + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, dimensions, config) { + layer_type_ = galois::GNNLayerType::kL2Norm; + // input/output columns must be equivalent in a softmax + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + GALOIS_LOG_VERBOSE("L2 norm initialized"); + } + + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings); + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient); + +private: + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings); + + PointerWithSize + BackwardPhaseCPU(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient); + + //! No op + void OptimizeLayer(BaseOptimizer*, size_t) { return; }; + +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) +#endif +}; + +} // namespace galois diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp new file mode 100644 index 0000000000..a29fccab1d --- /dev/null +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -0,0 +1,121 @@ +#include "galois/layers/L2NormLayer.h" +const galois::PointerWithSize +galois::L2NormLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + return ForwardPhaseCPU(input_embeddings); +} + +const galois::PointerWithSize +galois::L2NormLayer::ForwardPhaseCPU( + const galois::PointerWithSize input_embeddings) { + forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + // for each row, get square root of squared sums then normalize + const size_t feature_length = layer_dimensions_.input_columns; + // TODO(loc) make sure this works in distributed setting as well + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned row) { + if (IsSampledLayer()) { + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) + return; + } + + if (graph_.IsValidForPhase(row, layer_phase_)) { + size_t row_offset = row * feature_length; + float running_square_sum = 0.0; + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(input_embeddings[row_index], 2); + } + + // make sure running sum isn't too small + running_square_sum = + (running_square_sum < 1.0e-12) ? 
10e-12 : running_square_sum; + + // sqrt of sums, then divide row by it + float sqrt_squares = std::pow(running_square_sum, 0.5); + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + forward_output_matrix_[row_index] = + input_embeddings[row_index] / sqrt_squares; + } + } + }, + galois::loopname("L2ForwardNormalization")); + + return forward_output_matrix_; +} + +galois::PointerWithSize galois::L2NormLayer::BackwardPhase( + const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + return BackwardPhaseCPU(prev_layer_input, input_gradient); +} + +galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { + backward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + const size_t feature_length = layer_dimensions_.input_columns; + + // derivative of some x_1 is sum of gradient w.r.t. x_1 for all elements of + // the row (since l2 norm affects entire row) + // The math itself can be derived using quotient/chain rule on each element + // of the normalized row + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned row) { + if (IsSampledLayer()) { + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) + return; + } + + if (graph_.IsValidForPhase(row, layer_phase_)) { + size_t row_offset = row * feature_length; + // note: if you work this out on paper it turns out that terms that + // seem extra in the way this is calculated below simply get canceled + // out, so this ends up working out This implementation is taken from + // the IPDPS GraphSAINT implementation: I (loc) have confirmed the + // math checks out + float running_square_sum = 0.0; + float mult_with_input = 0.0; + + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(prev_layer_input[row_index], 2); + // gradient multiplied with corresponding input; subtraction because + // derivative math ends up working out that way + mult_with_input -= + prev_layer_input[row_index] * (*input_gradient)[row_index]; + } + running_square_sum = + (running_square_sum < 1.0e-12) ? 
10e-12 : running_square_sum; + assert(running_square_sum != 0.0); + + // denominator for all gradients is just the square sum to the -3/2'd + // power since this is -, all we have to do is multiply it later + // rather than divide + float denominator = std::pow(running_square_sum, -1.5); + assert(denominator != 0.0); + + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + backward_output_matrix_[row_index] = + denominator * + (prev_layer_input[row_index] * mult_with_input + + (*input_gradient)[row_index] * running_square_sum); + } + } + }, + galois::loopname("L2Backward")); + + return PointerWithSize(backward_output_matrix_); +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9e10da1246..4f2eca0295 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -6,7 +6,11 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(convlayer-test convlayer-test.cpp) target_link_libraries(convlayer-test galois_gnn) add_test(NAME convlayer-test COMMAND convlayer-test) - + + add_executable(l2norm-layer-test l2norm-layer-test.cpp) + target_link_libraries(l2norm-layer-test galois_gnn) + add_test(NAME l2norm-layer-test COMMAND l2norm-layer-test) + add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp new file mode 100644 index 0000000000..a66c419a7f --- /dev/null +++ b/libgnn/test/l2norm-layer-test.cpp @@ -0,0 +1,85 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/L2NormLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 2; + dimension_0.output_columns = 2; + + std::vector l2_input(14, 0.0); + l2_input[0] = 4; + l2_input[1] = 3; + l2_input[2] = 4; + l2_input[3] = 3; + l2_input[4] = 4; + l2_input[5] = 3; + l2_input[6] = 4; + l2_input[7] = 3; + l2_input[8] = 4; + l2_input[9] = 3; + l2_input[10] = 4; + l2_input[11] = 3; + l2_input[12] = 4; + l2_input[13] = 3; + + auto l2_layer = + std::make_unique(2, test_graph, dimension_0); + galois::PointerWithSize normed = + l2_layer->ForwardPhase(l2_input); + + // only go up to 5 because training set + for (size_t row = 0; row < 5; row++) { + GALOIS_LOG_VASSERT(std::abs(normed[row * 2] - 0.8) < 0.0001, + "input 4 should become 0.8 not {}, index {}", + normed[row * 2], row * 2); + GALOIS_LOG_VASSERT(std::abs(normed[row * 2 + 1] - 0.6) < 0.0001, + "input 3 should become 0.6 not {}, index {}", + normed[row * 2 + 1], row * 2 + 1); + } + // only go up to 5 because training set + for (size_t row = 5; row < 7; row++) { + GALOIS_LOG_VASSERT(std::abs(normed[row * 2] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2); + GALOIS_LOG_VASSERT(std::abs(normed[row * 2 + 1] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2 + 1); + } + + // backward + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + galois::PointerWithSize grads = + l2_layer->BackwardPhase(l2_input, 
&dummy_ones); + float out_4 = (-3.0 / 125.0); + float out_3 = (4.0 / 125.0); + for (size_t row = 0; row < 5; row++) { + GALOIS_LOG_VASSERT(std::abs(grads[row * 2] - out_4) < 0.0001, + "index {} grad 4 gradient should be {} not {}", row * 2, + out_4, grads[row * 2]); + GALOIS_LOG_VASSERT(std::abs(grads[row * 2 + 1] - out_3) < 0.0001, + "index {} grad 3 gradient should be {} not {}", + row * 2 + 1, out_3, grads[row * 2 + 1]); + } + + for (size_t row = 5; row < 7; row++) { + GALOIS_LOG_VASSERT(std::abs(grads[row * 2] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2); + GALOIS_LOG_VASSERT(std::abs(grads[row * 2 + 1] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2 + 1); + } + + return 0; +} From 0b7c2eb8fbb1a9bdba7135d3db31a749689a4855 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 23 Feb 2021 14:01:40 -0600 Subject: [PATCH 475/660] SAGE Layer completed + tests This commit finishes up the implementation of the SAGE layer's backward phase. One thing to note is that if dropout is enabled, then the dropout matrix needs to be used during backward phase computations and not the original. This needs to be fixed in the other existing layers. GNNMath has a new version of the SGEMM routine that aggregates directly into the output matrix: this is useful for the SAGE split matrix. --- libgnn/include/galois/GNNMath.h | 5 + libgnn/include/galois/layers/SAGELayer.h | 13 ++ libgnn/src/GNNMath.cpp | 13 +- libgnn/src/GraphNeuralNetwork.cpp | 28 ++- libgnn/src/layers/SAGELayer.cpp | 88 ++++++-- libgnn/test/CMakeLists.txt | 4 + libgnn/test/sage-layer-test.cpp | 272 +++++++++++++++++++++++ lonestar/libgnnbench/src/Input.cpp | 13 +- 8 files changed, 407 insertions(+), 29 deletions(-) create mode 100644 libgnn/test/sage-layer-test.cpp diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 231d437836..9e17a448fc 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -82,4 +82,9 @@ void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output, + bool accumulate); + } // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 9dcd53d9c6..a489913ef5 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -43,6 +43,11 @@ class SAGELayer : public GNNLayer { } } + //! Returns the 2nd set of weight gradients + const PointerWithSize GetLayerWeightGradients2() { + return p_layer_weight_gradients_2_; + } + // Parent functions const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; @@ -79,9 +84,17 @@ class SAGELayer : public GNNLayer { GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + //! Same as above but uses the second set of weights (self feature weights) + void SelfFeatureUpdateEmbeddingsDerivative(const GNNFloat* gradients, + GNNFloat* output); + + //! override parent function: optimizes the second set of weights as well + void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); //! 
SAGE config params SAGELayerConfig sage_config_; + //! Need own optimizer for the 2nd weight matrix + std::unique_ptr second_weight_optimizer_; // second set of weights for the concat that may occur std::vector layer_weights_2_; diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index dcaaf31a42..38af349a8c 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -87,6 +87,15 @@ void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output) { + CBlasSGEMM(trans_a, trans_b, input_rows, input_columns, output_columns, a, b, + output, false); +} + +void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, + const CBLAS_TRANSPOSE trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output, + bool accumulate) { // set lead dimension based on cblas spec w.r.t. transpose setting size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; size_t lead_dim_b = @@ -94,6 +103,6 @@ void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, // do the MM // TODO roll our own sgemm rather than use 3rd party? cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, - input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, 0.0, output, - output_columns); + input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, + accumulate ? 1.0 : 0.0, output, output_columns); } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 40e3c8a7e1..0905713dd8 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -1,10 +1,11 @@ #include "galois/GNNMath.h" #include "galois/GraphNeuralNetwork.h" -#include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/DenseLayer.h" -#include "galois/layers/SoftmaxLayer.h" -#include "galois/layers/SigmoidLayer.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/L2NormLayer.h" #include "galois/layers/SAGELayer.h" +#include "galois/layers/SigmoidLayer.h" +#include "galois/layers/SoftmaxLayer.h" galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, @@ -56,6 +57,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( i, *graph_, layer_dims, config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) sage layer gpu +#endif + break; + case GNNLayerType::kL2Norm: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) l2 layer gpu #endif break; case GNNLayerType::kDense: @@ -68,10 +76,18 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + } - if (i == config_.num_intermediate_layers() - 1) { - // last layer before output layer should never have activation - gnn_layers_.back()->DisableActivation(); + // loop backward and find last GCN/SAGE (main) layer to disable activation + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + galois::gDebug("Disabling activation on layer ", + (*back_iter)->layer_number(), "\n"); + (*back_iter)->DisableActivation(); + break; } } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 461f23bd99..79e757e93c 100644 --- 
a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -22,6 +22,9 @@ galois::SAGELayer::SAGELayer(size_t layer_num, p_layer_weights_2_ = PointerWithSize(layer_weights_2_); p_layer_weight_gradients_2_ = PointerWithSize(layer_weight_gradients_2_); + // initialize the optimizer + std::vector weight_size = {num_weight_elements}; + second_weight_optimizer_ = std::make_unique(weight_size, 1); } size_t num_input_elements = @@ -112,10 +115,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( if (!sage_config_.disable_concat) { // FW1 is unaffected by the agg/update flip, so can to it // separately - SelfFeatureUpdateEmbeddings(input_data, p_out_temp_.data()); - // add result to the output matrix: FW1 + AFW2 - MatrixAdd(layer_dimensions_.input_rows, p_out_temp_, - &p_forward_output_matrix_); + SelfFeatureUpdateEmbeddings(input_data, p_forward_output_matrix_.data()); } if (!config_.disable_activation) { @@ -125,6 +125,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return p_forward_output_matrix_; } @@ -138,7 +139,29 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( ActivationDerivative(input_gradient); } + // if dropout was used, use the dropout matrix for the input + galois::PointerWithSize input_to_use; + if (!config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_to_use = p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_to_use = prev_layer_input; + } + // AFW = O + if (!sage_config_.disable_concat) { + // Fw1 + AFW2 = O; self feature has own weight matrix and makes own + // contribution to gradients which is handled in this block + // !!!! do this early because p_in_temp may get overwritten later + // if update occurs before aggregate !!! 
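+ // In matrix terms this computes (self weight gradient) = F^T O', where F is
+ // the (possibly dropout'd) input features and O' is the incoming gradient;
+ // (input_columns x input_rows) times (input_rows x output_columns) matches
+ // the dimensions of the second (self feature) weight matrix.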
+ galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_to_use.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); + } // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -157,8 +180,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), p_backward_output_matrix_.data(), &input_column_intermediates_, true); - // TODO if training A, then A' compute here if layer # is 0 - // dot product of edges that exist in A } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there @@ -180,8 +201,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } #endif } else { - // TODO at this point, out_temp contains memoized FW - // can use it to get A' = O' (FW)^T // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation // this is (FW)' @@ -195,24 +214,35 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // TODO put this in a function // W' = F^T (FW)' + // input to use is not overwritten in this branch so it's safe to use #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, input_to_use.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), p_out_temp_.data(), - p_layer_weight_gradients_.data()); + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, + layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, input_to_use.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif } + if (!sage_config_.disable_concat) { + if (layer_number_ != 0) { + // deal with feature gradients for the self feature here + // this function will sum directly into the backward matrix + SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), + p_backward_output_matrix_.data()); + } + } + + // TODO(loc) sync both weight matrices WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { @@ -371,7 +401,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, node_embeddings, - layer_weights_2_.data(), output); + layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU } #endif @@ -399,3 +429,31 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, } #endif } + +void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( + const GNNFloat* gradients, GNNFloat* output) { + assert(p_layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + // TODO gpu self +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + // true at end -> accumulate + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + 
layer_dimensions_.input_columns, gradients, + layer_weights_2_.data(), output, true); +#ifdef GALOIS_ENABLE_GPU +#endif +} + +void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, + size_t trainable_layer_number) { + optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, + trainable_layer_number); + if (!sage_config_.disable_concat) { + second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, + p_layer_weights_2_, 0); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 4f2eca0295..853c5a22f9 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -7,6 +7,10 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(convlayer-test galois_gnn) add_test(NAME convlayer-test COMMAND convlayer-test) + add_executable(sage-layer-test sage-layer-test.cpp) + target_link_libraries(sage-layer-test galois_gnn) + add_test(NAME sage-layer-test COMMAND sage-layer-test) + add_executable(l2norm-layer-test l2norm-layer-test.cpp) target_link_libraries(l2norm-layer-test galois_gnn) add_test(NAME l2norm-layer-test COMMAND l2norm-layer-test) diff --git a/libgnn/test/sage-layer-test.cpp b/libgnn/test/sage-layer-test.cpp new file mode 100644 index 0000000000..dadc8b0096 --- /dev/null +++ b/libgnn/test/sage-layer-test.cpp @@ -0,0 +1,272 @@ +//! @file sage-layer-test.cpp +//! Sage layer test + +#include "galois/Logging.h" +#include "galois/layers/SAGELayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); + galois::SAGELayerConfig scon; + scon.disable_concat = false; + + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, dimension_0, dcon, + scon); + layer_0->InitAllWeightsTo1(); + // sage weights for self + layer_0->InitSelfWeightsTo1(); + + // make sure it runs in a sane manner + const galois::PointerWithSize layer_0_forward_output = + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_0_forward_output[0] == 3, "{} should be 3", + layer_0_forward_output[0]); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_0_forward_output[2] == 9, "{} should be 6", + layer_0_forward_output[2]); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 9); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 45); 
+ GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 33); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + galois::PointerWithSize layer_0_backward_output = + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + //////////////////////////////////////////////////////////////////////////////// + //// sanity check layer 0 backward output; all 0 because layer 0 + //////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + galois::PointerWithSize layer_0_weight_gradients = + layer_0->GetLayerWeightGradients(); + galois::PointerWithSize layer_0_weight_gradients_2 = + layer_0->GetLayerWeightGradients2(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_0_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_0_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[4] == 21); + + layer_0.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + + auto layer_1 = std::make_unique(1, test_graph, dimension_0, + dcon, scon); + layer_1->InitAllWeightsTo1(); + layer_1->InitSelfWeightsTo1(); + + galois::PointerWithSize layer_1_forward_output = + 
layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_1_forward_output[0] == 3, "{} should be 3", + layer_1_forward_output[0]); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_1_forward_output[2] == 9, "{} should be 6", + layer_1_forward_output[2]); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 9); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 33); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(14, 1); + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 4); + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); + + galois::PointerWithSize layer_1_weight_gradients_2 = + layer_1->GetLayerWeightGradients2(); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_1_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[2] == 
21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[4] == 21); + + layer_1.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerConfig config; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; + config.disable_aggregate_after_update = false; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + auto layer_2 = std::make_unique(1, test_graph, dimension_0, + config, scon); + galois::PointerWithSize l2_fo = + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + galois::PointerWithSize l2_bo = + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + + return 0; +} diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index c48f0b41b4..dbddb552e2 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -42,12 +42,13 @@ llvm::cl::list cl_layer_types( "layerTypes", cll::desc("Comma separated list of layer types specifying " "intermediate layers (does not include output)"), - cll::values(clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", - "Graph Convolutional Layer (default)"), - clEnumValN(galois::GNNLayerType::kSAGE, "sage", - "SAGE layer (GCN with concat + mean)"), - clEnumValN(galois::GNNLayerType::kDense, "dense", - "Dense Layer")), + cll::values( + clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", + "Graph Convolutional Layer (default)"), + clEnumValN(galois::GNNLayerType::kSAGE, "sage", + "SAGE layer (GCN with concat + mean)"), + clEnumValN(galois::GNNLayerType::kL2Norm, "l2norm", "L2 norm layer"), + clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), cll::CommaSeparated); llvm::cl::opt From 8ee70932ef08ce870a771e06ca48061c57ee7279 Mon Sep 17 00:00:00 2001 
From: Loc Hoang Date: Tue, 23 Feb 2021 14:21:11 -0600 Subject: [PATCH 476/660] Fixed dropout derivative bug in Dense/GCN layers Backward prop if dropout is on needs to be done with the dropout'd matrix and not the original one. This is already fixed in the SAGE layer (and was the reason why SAGE was giving me issues for a week). --- libgnn/src/layers/DenseLayer.cpp | 12 ++++++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 21 +++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index b767805a6a..b2da6bf010 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -67,10 +67,20 @@ galois::PointerWithSize galois::DenseLayer::BackwardPhase( p_backward_output_matrix_.data()); } + galois::PointerWithSize input_data; + if (!config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + } + // W' = F^T (FW)' galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, input_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); // sync weight gradients; note aggregation sync occurs in the function call // already diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 81bebfd8e2..70e37ab23c 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -100,6 +100,15 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // AFW = O + galois::PointerWithSize input_data; + if (!config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + } // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -160,15 +169,15 @@ galois::GraphConvolutionalLayer::BackwardPhase( if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), p_out_temp_.data(), - p_layer_weight_gradients_.data()); + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, + layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif From 95d8c079e0f909a117b58d32afb75cce25c85734 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:08:14 -0600 Subject: [PATCH 477/660] VectorMultAdd Fused multiply-add of vectors in GNNMath. 
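To be concrete about the semantics: the new routine is an element-wise
multiply-accumulate, output[i] = a[i] + b[i] * b_scale. A minimal scalar sketch
of what the vectorized path computes (GNNFloat is the library's floating point
typedef; the reference name here is only for illustration):

    // Scalar reference for galois::VectorMulAdd; the AVX2 path in this patch
    // does the same arithmetic eight floats at a time and uses this loop for
    // the unaligned tail.
    void VectorMulAddReference(size_t length, const GNNFloat* a,
                               const GNNFloat* b, const GNNFloat b_scale,
                               GNNFloat* output) {
      for (size_t i = 0; i < length; ++i) {
        output[i] = a[i] + b[i] * b_scale;
      }
    }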
--- libgnn/include/galois/GNNMath.h | 4 ++++ libgnn/src/GNNMath.cpp | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 9e17a448fc..e32d062cc5 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -13,6 +13,10 @@ size_t MaxIndex(const size_t length, const GNNFloat* vector); //! Can be called in parallel sections as its sigle threaded code void VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Given 2 float array pointers, do element wise addition of length elements +//! while scaling the second vector with a multiplier +void VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + const GNNFloat b_scale, GNNFloat* output); //! Does a softmax operation on the input vector and saves result to output //! vector; single threaded so it can be called in a parallel section diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 38af349a8c..fe14198d83 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -35,6 +35,8 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, output[i] = a[i] + b[i]; } #else + galois::gWarn("No vectorization support on this machine! Falling back to " + "simple for loop"); // no vector -> trivial loop add for (size_t i = 0; i < length; ++i) { output[i] = a[i] + b[i]; @@ -42,6 +44,40 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, #endif } +void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + const GNNFloat b_scale, GNNFloat* output) { +#ifdef __AVX2__ + constexpr size_t vectorization_length = + 8; // for 32-bit floating point in AVX2; TODO AVX512 + // can only do up to a particular multiple due to alignment + // create scale vector for b + __m128 scale_vec_half = _mm_set_ps(b_scale, b_scale, b_scale, b_scale); + __m256 scale_vec_main = _mm256_castps128_ps256(scale_vec_half); + scale_vec_main = _mm256_insertf128_ps(scale_vec_main, scale_vec_half, 1); + + const size_t aligned_end = length - length % vectorization_length; + // do add via vector ops + for (size_t i = 0; i < aligned_end; i += vectorization_length) { + _mm256_storeu_ps( + &output[i], + _mm256_add_ps(_mm256_loadu_ps(&a[i]), + _mm256_mul_ps(scale_vec_main, _mm256_loadu_ps(&b[i])))); + } + + // handle the rest + for (size_t i = aligned_end; i < length; ++i) { + output[i] = a[i] + b[i] * b_scale; + } +#else + galois::gWarn("No vectorization support on this machine! Falling back to " + "simple for loop"); + // no vector -> trivial loop add + for (size_t i = 0; i < length; ++i) { + output[i] = a[i] + b[i] * b_scale; + } +#endif +} + void galois::GNNSoftmax(const size_t vector_length, const GNNFloat* input, GNNFloat* output) { const GNNFloat max_element = From 9cba4e28a6bf775cf5a1ca9b7a413d49e8be6519 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:14:30 -0600 Subject: [PATCH 478/660] GCN layer aggregation uses fused multiply add Gets rid of the need for per thread storage (very slow) and on demand alloc of an intermediate vector by using a fused multiply add for aggregation. Also adds a few timers to the layer. 
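Concretely, the per-edge inner update in AggregateAllCPU goes from staging a
scaled copy of the destination row in per-thread storage to a single fused
call; the before/after pattern, consolidated from the hunk below:

    // Before: per-thread temporary plus two passes over the row.
    //   for (size_t i = 0; i < column_length; i++)
    //     intermediate[i] = norm_scale * node_embeddings[index_to_dst_feature + i];
    //   galois::VectorAdd(column_length, &aggregate_output[index_to_src_feature],
    //                     intermediate, &aggregate_output[index_to_src_feature]);
    // After: one pass, no temporary, same arithmetic.
    galois::VectorMulAdd(column_length, &aggregate_output[index_to_src_feature],
                         &node_embeddings[index_to_dst_feature], norm_scale,
                         &aggregate_output[index_to_src_feature]);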
--- .../galois/layers/GraphConvolutionalLayer.h | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 46 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index c677389df7..47980dcd0c 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -32,6 +32,7 @@ class GraphConvolutionalLayer : public GNNLayer { PointerWithSize* input_gradient) final; private: + static const constexpr char* kRegionName = "GCNLayer"; // 2 temporaries the size of the forward input; used for dropout and // aggregation (if either are required) std::vector in_temp_1_; @@ -71,7 +72,6 @@ class GraphConvolutionalLayer : public GNNLayer { void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); - #ifdef GALOIS_ENABLE_GPU GCNGPUAllocations gpu_object_; #endif diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 70e37ab23c..bbf42a47b0 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -44,6 +44,8 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( const galois::PointerWithSize galois::GraphConvolutionalLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { + galois::StatTimer timer("ForwardPhase", kRegionName); + timer.start(); GALOIS_LOG_VERBOSE("Calling forward phase"); assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); @@ -85,6 +87,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + timer.stop(); return p_forward_output_matrix_; } @@ -92,6 +95,9 @@ galois::PointerWithSize galois::GraphConvolutionalLayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { + galois::StatTimer timer("BackwardPhase", kRegionName); + timer.start(); + assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation @@ -193,6 +199,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( DoDropoutDerivative(); } + timer.stop(); return p_backward_output_matrix_; } @@ -201,6 +208,9 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, [[maybe_unused]] galois::substrate::PerThreadStorage>* pts) { + galois::StatTimer timer("Aggregate", kRegionName); + timer.start(); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.AggregateAllGPU( @@ -213,12 +223,13 @@ void galois::GraphConvolutionalLayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts) { + galois::substrate::PerThreadStorage>*) { size_t num_nodes = graph_.size(); galois::do_all( @@ -282,17 +293,10 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( if (!config_.disable_normalization) { GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); - // scale the value on the destination by the combined norm term - assert(pts->getLocal()->size() == column_length); - GNNFloat* intermediate = 
pts->getLocal()->data(); - for (size_t i = 0; i < column_length; i++) { - intermediate[i] = - norm_scale * node_embeddings[index_to_dst_feature + i]; - } - // add intermediate instead of original feature - galois::VectorAdd( + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], - intermediate, &aggregate_output[index_to_src_feature]); + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); } else { // add dst feature to aggregate output galois::VectorAdd(column_length, @@ -301,18 +305,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( &aggregate_output[index_to_src_feature]); } } - - // GNNFloat* intermediate = pts->getLocal()->data(); - // GNNFloat norm_scale = source_norm * source_norm; - // for (size_t i = 0; i < column_length; i++) { - // intermediate[i] = - // norm_scale * node_embeddings[index_to_src_feature + i]; - // } - // // add self - // galois::VectorAdd(column_length, - // &aggregate_output[index_to_src_feature], - // intermediate, - // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync @@ -321,6 +313,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("ForwardXform", kRegionName); + timer.start(); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.UpdateEmbeddingsGPU( @@ -337,10 +332,14 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddings( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("BackwardXform", kRegionName); + timer.start(); + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU @@ -360,4 +359,5 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } From c89ad4638dca6b2cb465de85105a3057224b326a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:40:57 -0600 Subject: [PATCH 479/660] Preliminary timers Some timers added to GNN and GNNLayer --- libgnn/src/GraphNeuralNetwork.cpp | 4 ++++ libgnn/src/layers/GNNLayer.cpp | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 0905713dd8..0badb4f312 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -148,6 +148,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { + const std::string t_name = "Epoch" + std::to_string(epoch); + galois::StatTimer epoch_timer(t_name.c_str(), "GraphNeuralNetwork"); + epoch_timer.start(); if (config_.do_sampling()) { // subgraph sample every epoch // graph_->UniformNodeSample(); @@ -176,6 +179,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); } + epoch_timer.stop(); // TODO validation and test as necessary } graph_->CalculateFullNormFactor(); diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 6deab8e682..2018c4f5c5 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp 
@@ -138,6 +138,8 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { + galois::StatTimer timer("ForwardDropout", "GNNLayer"); + timer.start(); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, @@ -148,9 +150,12 @@ void galois::GNNLayer::DoDropout( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GNNLayer::DoDropoutDerivative() { + galois::StatTimer timer("BackwardDropout", "GNNLayer"); + timer.start(); assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); @@ -172,9 +177,13 @@ void galois::GNNLayer::DoDropoutDerivative() { #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GNNLayer::Activation() { + galois::StatTimer timer("ForwardActivation", "GNNLayer"); + timer.start(); + // TODO only does relu at the moment; should check user specified activation // and act accordingly galois::do_all( @@ -184,10 +193,14 @@ void galois::GNNLayer::Activation() { std::max(forward_output_matrix_.at(i), static_cast(0)); }, galois::loopname("ReLU")); + timer.stop(); } void galois::GNNLayer::ActivationDerivative( PointerWithSize* gradient) { + galois::StatTimer timer("BackwardActivation", "GNNLayer"); + timer.start(); + // TODO only does relu at the moment; should check user specified activation // and act accordingly // keep gradient if the original output is greater than 0 @@ -200,6 +213,7 @@ void galois::GNNLayer::ActivationDerivative( : static_cast(0); }, galois::loopname("ReLU-Derivative")); + timer.stop(); } void galois::GNNLayer::WeightGradientSyncSum() { From c279dff4d7685365489560fb349cc4d41ad49a1a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:50:15 -0600 Subject: [PATCH 480/660] SAGE aggregation: fused multiply add Like GCN layer, get rid of need for intermediate vector with fused multiply add. 
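As with the GCN change, this is safe because scaling into a temporary and then
adding it in performs exactly the same per-element multiply and add as the
fused call; only the intermediate store (and the per-thread buffer behind it)
goes away, so results are unchanged. With acc the source row being accumulated
into, x the destination row, and s the norm scale:

    // old: tmp[i] = s * x[i]; acc[i] += tmp[i]   is element-wise identical to
    // new: acc[i] += s * x[i]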
--- libgnn/src/layers/SAGELayer.cpp | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 79e757e93c..dfbd006eba 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -284,7 +284,7 @@ void galois::SAGELayer::AggregateAll( void galois::SAGELayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts, + galois::substrate::PerThreadStorage>*, bool is_backward) { size_t num_nodes = graph_.size(); @@ -346,17 +346,10 @@ void galois::SAGELayer::AggregateAllCPU( norm_scale = graph_.DegreeNorm(dst); } - // scale the value on the destination by the combined norm term - assert(pts->getLocal()->size() == column_length); - GNNFloat* intermediate = pts->getLocal()->data(); - for (size_t i = 0; i < column_length; i++) { - intermediate[i] = - norm_scale * node_embeddings[index_to_dst_feature + i]; - } - // add intermediate instead of original feature - galois::VectorAdd( + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], - intermediate, &aggregate_output[index_to_src_feature]); + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); } else { // add dst feature to aggregate output galois::VectorAdd(column_length, From 34199abaee13d25aab0859cbe35bdd9abe12a02c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 25 Feb 2021 18:22:05 -0600 Subject: [PATCH 481/660] Fix GNN CVC by fixing the edge master function Edge assign function for GNN's CVC was wrong: need to account for non-contiguous rows. This commit fixes that. Also adds aggregate sync test for CVC (long overdue) and adds the original CVC as an option to the gcn app. --- .../galois/graphs/GenericPartitioners.h | 12 +- libgnn/include/galois/graphs/GNNGraph.h | 2 +- libgnn/src/graphs/GNNGraph.cpp | 7 +- libgnn/test/aggregate-sync-test.cpp | 161 +++++++++++++++++- lonestar/libgnnbench/src/Input.cpp | 6 +- 5 files changed, 180 insertions(+), 8 deletions(-) diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index 006faea862..b02d2c9594 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -981,6 +981,11 @@ class GnnCVC : public galois::graphs::CustomMasterAssignment { //! Returns the grid column ID of the specified host unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } + //! Find the row of a particular node + unsigned getRowOfNode(uint64_t gid) const { + return gridRowID(retrieveMaster(gid)); + } + //! 
Find the column of a particular node unsigned getColumnOfNode(uint64_t gid) const { return gridColumnID(retrieveMaster(gid)); @@ -1009,9 +1014,10 @@ class GnnCVC : public galois::graphs::CustomMasterAssignment { uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } - uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const { - int i = getColumnOfNode(dst); - return _h_offset + i; + uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const { + unsigned blockedRowOffset = getRowOfNode(src) * numColumnHosts; + unsigned cyclicColumnOffset = getColumnOfNode(dst); + return blockedRowOffset + cyclicColumnOffset; } bool noCommunication() { return false; } diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index b69eb43ea2..5f4a337845 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -28,7 +28,7 @@ struct GNNRange { namespace graphs { //! Possible partitioning schemes for the GNN graph -enum class GNNPartitionScheme { kOEC, kCVC }; +enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; //! XXX class GNNGraph { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 919d7340e4..cd76d118e0 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -23,6 +23,9 @@ LoadPartition(const std::string& input_directory, case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + case galois::graphs::GNNPartitionScheme::kOCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; @@ -74,7 +77,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, sync_substrate_ = std::make_unique>( *partitioned_graph_, host_id_, - galois::runtime::getSystemNetworkInterface().Num, false); + galois::runtime::getSystemNetworkInterface().Num, false, + partitioned_graph_->cartesianGrid()); // read in entire graph topology ReadWholeGraph(dataset_name); @@ -163,7 +167,6 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_column_length_ = matrix_column_size; // XXX bitset setting - // call sync sync_substrate_->sync( "GraphAggregateSync"); } diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index d13674f1a2..888f2ca69f 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -22,6 +22,11 @@ int main() { test_graph->GetGID(test_graph->EdgeDestination(e)), "\n"); } } + for (auto own = test_graph->begin_owned(); own != test_graph->end_owned(); + own++) { + galois::gPrint(test_graph->host_prefix(), "Node owned GID ", + test_graph->GetGID(*own), "\n"); + } // create same layer from convlayer-test and make sure result is the same even // in multi-host environment @@ -31,6 +36,7 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig l_config; l_config.disable_aggregate_after_update = false; + l_config.DebugConfig(); // create the layer, no norm factor std::unique_ptr layer_0 = @@ -197,6 +203,159 @@ int main() { GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth); } } + ////////////////////////////////////////////////////////////////////////////// + auto test_graph_2 = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + // print edges for sanity + 
for (size_t node = 0; node < test_graph_2->size(); node++) { + for (auto e = test_graph_2->EdgeBegin(node); + e != test_graph_2->EdgeEnd(node); e++) { + galois::gPrint( + test_graph_2->host_prefix(), "Edge ", test_graph_2->GetGID(node), " ", + test_graph_2->GetGID(test_graph_2->EdgeDestination(e)), "\n"); + } + } + for (auto own = test_graph_2->begin_owned(); own != test_graph_2->end_owned(); + own++) { + galois::gPrint(test_graph_2->host_prefix(), "Node owned GID ", + test_graph_2->GetGID(*own), "\n"); + } + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + dimension_0.input_rows = test_graph_2->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + l_config.disable_aggregate_after_update = false; + l_config.DebugConfig(); + + // create the layer, no norm factor + layer_0 = std::make_unique( + 0, *(test_graph_2.get()), dimension_0, l_config); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + layer_0_forward_output = + layer_0->ForwardPhase(test_graph_2->GetLocalFeatures()); + + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_VASSERT(layer_0_forward_output[row * 2 + c] == ground_truth, + "{} Row {} GID {} need to be {} not {}", + test_graph_2->host_prefix(), row, global_row, + ground_truth, layer_0_forward_output[row * 2 + c]); + } + } + + layer_1 = std::make_unique( + 1, *(test_graph_2.get()), dimension_0, l_config); + layer_1->InitAllWeightsTo1(); + layer_1_forward_output = + layer_1->ForwardPhase(test_graph_2->GetLocalFeatures()); + + // same check for forward as before + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_1_forward_output[row * 2 + c] == ground_truth); + } + } + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(test_graph_2->size() * 2, 1); + layer_1_backward_output = + layer_1->BackwardPhase(test_graph_2->GetLocalFeatures(), &dummy_ones); + + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; - // XXX TODO CVC + switch (global_row) { + case 0: + case 6: + ground_truth = 2; + break; + case 1: + case 2: + case 3: + case 4: + case 5: + ground_truth = 4; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 3 columns + for (size_t c = 0; 
c < 3; c++) {
+      GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth);
+    }
+  }
 }
diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp
index dbddb552e2..e02adb56dc 100644
--- a/lonestar/libgnnbench/src/Input.cpp
+++ b/lonestar/libgnnbench/src/Input.cpp
@@ -21,7 +21,9 @@ llvm::cl::opt partition_scheme(
     cll::values(clEnumValN(galois::graphs::GNNPartitionScheme::kOEC, "oec",
                            "Outgoing Edge-Cut (default)"),
                 clEnumValN(galois::graphs::GNNPartitionScheme::kCVC, "cvc",
-                           "Cartesian Vertex-Cut")),
+                           "Cartesian Vertex-Cut"),
+                clEnumValN(galois::graphs::GNNPartitionScheme::kOCVC, "ocvc",
+                           "Original Cartesian Vertex-Cut")),
     cll::init(galois::graphs::GNNPartitionScheme::kOEC));

 llvm::cl::opt num_layers(
@@ -118,6 +120,8 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) {
     return "oec";
   case galois::graphs::GNNPartitionScheme::kCVC:
     return "cvc";
+  case galois::graphs::GNNPartitionScheme::kOCVC:
+    return "ocvc";
   default:
     GALOIS_LOG_FATAL("Invalid partitioning scheme");
     return "";

From f71e418ecc2016e0ac7acf2b696e35d2690dc108 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Thu, 25 Feb 2021 20:28:13 -0600
Subject: [PATCH 482/660] Support for non-existent labels (-1)

If any bit of a label bitset for single-class graphs is -1, then that
node has no label. This is now supported in the code.
---
 libgnn/include/galois/graphs/GNNGraph.h | 7 ++++++-
 libgnn/src/graphs/GNNGraph.cpp          | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h
index 5f4a337845..02cef8e621 100644
--- a/libgnn/include/galois/graphs/GNNGraph.h
+++ b/libgnn/include/galois/graphs/GNNGraph.h
@@ -109,7 +109,12 @@ class GNNGraph {
   //! class labels.
   GNNFloat GetSingleClassLabel(const unsigned lid) const {
     assert(using_single_class_labels_);
-    return local_ground_truth_labels_[lid];
+    if (local_ground_truth_labels_[lid] != num_label_classes_) {
+      return local_ground_truth_labels_[lid];
+    } else {
+      GALOIS_LOG_FATAL(
+          "should not get the label of a node that has no ground truth");
+    }
   }

   //!
Returns pointer to start of ground truth vector for some local id assuming diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index cd76d118e0..3ec30ec57f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -330,6 +330,12 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, label_stream >> cur_bit; if (has_single_class_label) { + // no label + if (cur_bit == = -1) { + local_ground_truth_labels_[cur_lid] = num_label_classes_; + break; + } + // in single class, only 1 bit is set in bitset; that represents the // class to take if (cur_bit != 0) { From 2543fb976c458fdfb9be2db3313ac7be783f75f8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Feb 2021 15:01:41 -0600 Subject: [PATCH 483/660] (attempted) -1 fixing for graph label reading Untested -1 reading fix; issue was unsigned being used to load -1 --- libgnn/src/graphs/GNNGraph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 3ec30ec57f..fd87c08fb6 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -323,7 +323,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); // read line as bitset of 0s and 1s std::istringstream label_stream(read_line); - unsigned cur_bit; + int cur_bit; // bitset size is # of label classes for (size_t cur_class = 0; cur_class < num_label_classes_; ++cur_class) { // read a bit @@ -331,7 +331,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, if (has_single_class_label) { // no label - if (cur_bit == = -1) { + if (cur_bit == -1) { local_ground_truth_labels_[cur_lid] = num_label_classes_; break; } From 5bff7cced8f30822b7a09d29fe85b5bda0fd2296 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Feb 2021 18:29:10 -0600 Subject: [PATCH 484/660] Val/test intervals added to training + timers Can now specify intervals where training will evaluate val/test set and return accuracy in order to track progress. Added a lot more kinds of timers and stats that break down epoch time and accuracy for the stat file (prep for paper result collection). --- libgnn/include/galois/GraphNeuralNetwork.h | 6 + libgnn/src/GraphNeuralNetwork.cpp | 132 +++++++++++++++------ lonestar/gnn/distributed/gcn/gcn-dist.cpp | 4 - lonestar/libgnnbench/src/Input.cpp | 14 ++- 4 files changed, 115 insertions(+), 41 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index ae860d0d32..3df6fbe94e 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -103,6 +103,10 @@ class GraphNeuralNetworkConfig { bool do_sampling_{false}; //! Inductive = training ignores test/val set bool inductive_training_{false}; + //! Interval to run validation set on network at; 0 = no run + unsigned validation_interval_{0}; + //! Interval to run testing set on network at; 0 = no run + unsigned test_interval_{0}; private: //! Number of layers to construct in the GNN not including the output @@ -198,6 +202,8 @@ class GraphNeuralNetwork { #ifdef GALOIS_ENABLE_GPU //! 
Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; + // Used to copy predictions from gpu over + std::vector cpu_pred_; #endif }; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 0badb4f312..7955d9e92f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -131,7 +131,6 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); - std::vector cpu_pred; float train_accuracy{0.f}; /* @@ -142,68 +141,113 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } */ + bool altered_norm_factor = + config_.inductive_training_ || config_.do_sampling(); + if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); } // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { - const std::string t_name = "Epoch" + std::to_string(epoch); + const std::string t_name = "TrainEpoch" + std::to_string(epoch); + const std::string t_name_acc = t_name + "Accuracy"; galois::StatTimer epoch_timer(t_name.c_str(), "GraphNeuralNetwork"); epoch_timer.start(); if (config_.do_sampling()) { // subgraph sample every epoch - // graph_->UniformNodeSample(); graph_->GraphSAINTSample(); graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); GradientPropagation(); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - if (cpu_pred.size() != predictions.size()) { - cpu_pred.resize(predictions.size()); - } + epoch_timer.stop(); - AdamOptimizer* adam = static_cast(optimizer_.get()); - adam->CopyToVector(cpu_pred, predictions); - train_accuracy = GetGlobalAccuracy(cpu_pred); - } else { -#endif - train_accuracy = GetGlobalAccuracy(predictions); -#ifdef GALOIS_ENABLE_GPU - } -#endif + train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", t_name_acc, + train_accuracy); } - epoch_timer.stop(); - // TODO validation and test as necessary + + bool do_validate = config_.validation_interval_ + ? epoch % config_.validation_interval_ == 0 + : false; + bool do_test = + config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; + + // get real norm factor back if altered by sampling or inductive training + if ((do_validate || do_test) && altered_norm_factor) { + graph_->CalculateFullNormFactor(); + } + + if (do_validate) { + const std::string v_name = "ValEpoch" + std::to_string(epoch); + const std::string v_name_acc = v_name + "Accuracy"; + galois::StatTimer val_epoch_timer(v_name.c_str(), "GraphNeuralNetwork"); + + val_epoch_timer.start(); + SetLayerPhases(galois::GNNPhase::kValidate); + const PointerWithSize val_pred = DoInference(); + val_epoch_timer.stop(); + + float val_acc = GetGlobalAccuracy(val_pred); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, + "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", v_name_acc, + val_acc); + } + } + + if (do_test) { + const std::string test_name = "TestEpoch" + std::to_string(epoch); + const std::string test_name_acc = test_name + "Accuracy"; + galois::StatTimer test_epoch_timer(test_name.c_str(), + "GraphNeuralNetwork"); + + test_epoch_timer.start(); + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + test_epoch_timer.stop(); + + float test_acc = GetGlobalAccuracy(test_pred); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", test_name_acc, + test_acc); + } + } + + if (do_validate || do_test) { + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + // get back inductive norm factor as necessary; sampling norm is handled + // at beginning of every iteration + if (config_.inductive_training_ && !config_.do_sampling()) { + graph_->CalculateSpecialNormFactor(false, true); + } + } + } + + if (altered_norm_factor) { + graph_->CalculateFullNormFactor(); } - graph_->CalculateFullNormFactor(); + // check test accuracy - galois::StatTimer acc_timer("FinalAccuracyTest"); - acc_timer.start(); + galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); + test_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize predictions = DoInference(); - float global_accuracy{0.0}; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - AdamOptimizer* adam = static_cast(optimizer_.get()); - adam->CopyToVector(cpu_pred, predictions); - global_accuracy = GetGlobalAccuracy(cpu_pred); - } else { -#endif - global_accuracy = GetGlobalAccuracy(predictions); -#ifdef GALOIS_ENABLE_GPU - } -#endif - acc_timer.stop(); + float global_accuracy = GetGlobalAccuracy(predictions); + test_timer.stop(); if (this_host == 0) { galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", + "FinalTestAccuracy", global_accuracy); } return global_accuracy; @@ -223,7 +267,23 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( PointerWithSize predictions) { - return graph_->GetGlobalAccuracy(predictions, phase_, config_.do_sampling()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (cpu_pred_.size() != predictions.size()) { + cpu_pred_.resize(predictions.size()); + } + + // TODO get rid of CPU copy here if possible + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred_, predictions); + return graph_->GetGlobalAccuracy(cpu_pred_, phase_, config_.do_sampling()); + } else { +#endif + 
return graph_->GetGlobalAccuracy(predictions, phase_, + config_.do_sampling()); +#ifdef GALOIS_ENABLE_GPU + } +#endif } void galois::GraphNeuralNetwork::GradientPropagation() { diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index 65fe1338cc..e3dd1cac77 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -15,11 +15,7 @@ int main(int argc, char* argv[]) { galois::StatTimer compute_timer("Timer_0"); compute_timer.start(); - - galois::StatTimer train_timer("TrainingTime"); - train_timer.start(); gnn->Train(num_epochs); - train_timer.stop(); compute_timer.stop(); return 0; diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index e02adb56dc..d8975204c5 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -114,6 +114,16 @@ llvm::cl::opt "all non-train nodes are ignored"), cll::init(false)); +llvm::cl::opt + val_interval("valInterval", + cll::desc("# of epochs to test validation set (default 0)"), + cll::init(0)); + +llvm::cl::opt + test_interval("testInterval", + cll::desc("# of epochs to test test set (default 0)"), + cll::init(0)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -245,7 +255,9 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.inductive_training_ = do_inductive_training; + gnn_config.inductive_training_ = do_inductive_training; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From b7a01bdb56a9f78fb98c9833710d64cf255614f7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Mar 2021 15:57:11 -0600 Subject: [PATCH 485/660] Fixed aggregate sync in GNNs The buffer wrapper I was using to save a copy of data during sync is incorrect as the memory can potentially get written before it is actually serialized into a message. This was leading to inconsistent and wrong results for GNN training. This commit changes it to use a gstl vector. There is now some copy overhead due to this, but the tradeoff is actual correct execution. Some small fixes to the aggregate sync test as well. --- .../graphs/GraphAggregationSyncStructures.h | 19 ++++++++++---- libgnn/src/layers/GraphConvolutionalLayer.cpp | 15 +++++++---- libgnn/test/aggregate-sync-test.cpp | 25 +++++++++++++++---- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 62a5ab14cb..e5dcb970af 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -17,16 +17,18 @@ extern unsigned layer_number_to_sync; #endif struct GNNSumAggregate { - using ValTy = galois::BufferWrapper; + using ValTy = galois::gstl::Vector; //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. 
// assert(device_personality == DevicePersonality::CPU); - ValTy extracted_vec( - &gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_], - gnn_matrix_to_sync_column_length_); + ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + extracted_vec[i] = + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]; + } // move constructor should kick in here to avoid return copy return extracted_vec; } @@ -43,8 +45,15 @@ struct GNNSumAggregate { return true; } - //! do nothing (waste of a write) + //! No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} + // Reset is here in case anyone wants to bring it back + // static void reset(uint32_t node_id, char&) { + // for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + // 0; + // } + //} //! element wise set static void setVal(uint32_t node_id, char&, ValTy y) { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index bbf42a47b0..7d7667a624 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -230,7 +230,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*) { - size_t num_nodes = graph_.size(); + size_t num_nodes = graph_.size(); + size_t last_master = *(graph_.end_owned()); + assert(0 == *(graph_.begin_owned())); galois::do_all( galois::iterate(static_cast(0), num_nodes), @@ -263,10 +265,13 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // init to self if (!config_.disable_self_aggregate) { - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i] * source_norm * - source_norm; + // only aggregate self once on master + if (src < last_master) { + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i] * source_norm * + source_norm; + } } } diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 888f2ca69f..7025331029 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -6,8 +6,7 @@ int main() { galois::DistMemSys G; if (galois::runtime::getSystemNetworkInterface().Num == 1) { - GALOIS_LOG_ERROR("This test should be run with multiple hosts/processes"); - exit(1); + GALOIS_LOG_WARN("This test should be run with multiple hosts/processes!"); } auto test_graph = std::make_unique( @@ -233,10 +232,26 @@ int main() { layer_0 = std::make_unique( 0, *(test_graph_2.get()), dimension_0, l_config); layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + // galois::PointerWithSize layer_0_forward_output = layer_0_forward_output = layer_0->ForwardPhase(test_graph_2->GetLocalFeatures()); + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + if (global_row == 1) { + galois::gPrint(test_graph_2->host_prefix(), "GID ", global_row, " local ", + row, " value ", layer_0_forward_output[row * 2], "\n"); + } + if (global_row == 4) { + galois::gPrint(test_graph_2->host_prefix(), "GID ", global_row, " local ", + row, " value ", layer_0_forward_output[row * 2], "\n"); + } + } + for (size_t row = 
0; row < test_graph_2->size(); row++) { // row -> GID size_t global_row = test_graph_2->GetGID(row); @@ -325,10 +340,10 @@ int main() { } } - // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones_v.assign(test_graph_2->size() * 2, 1); + std::vector dummy_ones_v2(test_graph_2->size() * 2, 1); + galois::PointerWithSize dummy_ones2(dummy_ones_v2); layer_1_backward_output = - layer_1->BackwardPhase(test_graph_2->GetLocalFeatures(), &dummy_ones); + layer_1->BackwardPhase(test_graph_2->GetLocalFeatures(), &dummy_ones2); for (size_t row = 0; row < test_graph_2->size(); row++) { // row -> GID From c983f2cd4e278b15e0f14071393c6a61910ec2af Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Mar 2021 17:29:34 -0600 Subject: [PATCH 486/660] Func to sync layer weight matrix; layer init Added a (currently unused) function that synchronizes entire weight matrix with a set operation. Currently unused because Bengio init will do the same thing on each host. Changed the seed used by Bengio init to include layer number as well to make it so each layer has a different weight set to start with. --- libgnn/include/galois/layers/GNNLayer.h | 3 +++ .../galois/layers/GradientSyncStructures.h | 27 +++++++++++++++++++ libgnn/src/layers/GNNLayer.cpp | 24 ++++++++++++++++- 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 0039683ad4..cfbf81fc51 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -225,6 +225,9 @@ class GNNLayer { PointerWithSize p_forward_output_matrix_; PointerWithSize p_backward_output_matrix_; + //! Synchronizes all weights (used in distributed setting) + void SyncInitialWeights(); + //! Wrapper over gradient matrix to make it compatible with Gluon std::unique_ptr gradient_sync_interface_; //! Synchronization substrate for the weight gradients diff --git a/libgnn/include/galois/layers/GradientSyncStructures.h b/libgnn/include/galois/layers/GradientSyncStructures.h index 32b7a85b82..ad76f514cd 100644 --- a/libgnn/include/galois/layers/GradientSyncStructures.h +++ b/libgnn/include/galois/layers/GradientSyncStructures.h @@ -34,4 +34,31 @@ struct WeightGradientSummation { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; +struct WeightGradientSet { + using ValTy = GNNFloat; + static ValTy extract(uint32_t, ValTy& weight) { return weight; } + static bool reduce(uint32_t, ValTy&, ValTy) { return true; } + + //! reset weight to 0 + static void reset(uint32_t, ValTy& weight) { weight = 0.0; } + + //! 
save weight + static void setVal(uint32_t, ValTy& weight, ValTy y) { weight = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 2018c4f5c5..6831ccb0b7 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -80,7 +80,7 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + layer_dimensions_.input_columns); // TODO this seed should be configurable - std::default_random_engine rng(1); + std::default_random_engine rng(1 + layer_number_); std::uniform_real_distribution dist(-max, max); for (size_t i = 0; i < vector_to_init->size(); i++) { @@ -238,3 +238,25 @@ void galois::GNNLayer::WeightGradientSyncAverage() { galois::loopname("WeightGradientSyncAverageDivide")); } } + +void galois::GNNLayer::SyncInitialWeights() { + if (galois::runtime::getSystemNetworkInterface().Num == 1) { + return; + } +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) + GALOIS_LOG_FATAL("Need to implement GPU version of this"); +#endif + // copy weights over to gradients + for (size_t i = 0; i < layer_weights_.size(); i++) { + layer_weight_gradients_[i] = layer_weights_[i]; + } + // sync "gradients" with a set only (reduction ignored) + gradient_sync_substrate_->sync( + "InitialSync"); + // copy "gradients" (actually weights) back to weight matrix + for (size_t i = 0; i < layer_weights_.size(); i++) { + layer_weights_[i] = layer_weight_gradients_[i]; + layer_weight_gradients_[i] = 0; + } +} From 921347891550702d201c1b23cac77c94b551c5b3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Mar 2021 13:19:11 -0600 Subject: [PATCH 487/660] SAGE paired matrix init The 2 matrices used by SAGE layer are technically 1 whole matrix. This commit adds a pair glorot bengio init to work on them as a pair. --- libgnn/include/galois/layers/GNNLayer.h | 4 ++++ libgnn/src/layers/GNNLayer.cpp | 29 ++++++++++++++++++++++++- libgnn/src/layers/SAGELayer.cpp | 5 ++++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index cfbf81fc51..ecd79bec34 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -263,6 +263,10 @@ class GNNLayer { //! Code inspired DGL and TinyDNN void GlorotBengioInit(std::vector* vector_to_init); + //! Init 2 things as one unit; used for SAGE + void PairGlorotBengioInit(std::vector* vector1, + std::vector* vector2); + //! 
Randomly init a float vector using the class's random init RNG
   void RandomInitVector(std::vector* vector_to_init);

diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp
index 6831ccb0b7..9f228cad25 100644
--- a/libgnn/src/layers/GNNLayer.cpp
+++ b/libgnn/src/layers/GNNLayer.cpp
@@ -79,7 +79,6 @@ galois::GNNLayer::GNNLayer(size_t layer_num,
 void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) {
   float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns +
                                          layer_dimensions_.input_columns);
-  // TODO this seed should be configurable
   std::default_random_engine rng(1 + layer_number_);
   std::uniform_real_distribution dist(-max, max);

@@ -93,6 +92,34 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) {
 #endif
 }

+void galois::GNNLayer::PairGlorotBengioInit(std::vector* vector1,
+                                            std::vector* vector2) {
+  // multiplied by 2 here because 2 pieces are 1 unit
+  float max =
+      std::sqrt(6.0) / std::sqrt((2 * layer_dimensions_.output_columns) +
+                                 layer_dimensions_.input_columns);
+  assert(vector1->size() ==
+         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
+  assert(vector2->size() ==
+         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
+  std::default_random_engine rng(1 + layer_number_);
+  std::uniform_real_distribution dist(-max, max);
+
+  for (size_t i = 0; i < vector1->size(); i++) {
+    (*vector1)[i] = dist(rng);
+  }
+  for (size_t i = 0; i < vector2->size(); i++) {
+    (*vector2)[i] = dist(rng);
+  }
+#ifdef GALOIS_ENABLE_GPU
+  // TODO
+  GALOIS_LOG_FATAL("TODO: copy both not 1");
+  if (device_personality == DevicePersonality::GPU_CUDA) {
+    CopyLayerWeightsToGPU();
+  }
+#endif
+}
+
 void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) {
   galois::do_all(
       galois::iterate(static_cast(0), vector_to_init->size()),
diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp
index dfbd006eba..c9b2a16da7 100644
--- a/libgnn/src/layers/SAGELayer.cpp
+++ b/libgnn/src/layers/SAGELayer.cpp
@@ -17,7 +17,10 @@ galois::SAGELayer::SAGELayer(size_t layer_num,
       layer_dimensions_.input_columns * layer_dimensions_.output_columns;
   layer_weights_2_.resize(num_weight_elements);
   layer_weight_gradients_2_.resize(num_weight_elements, 0);
-  GlorotBengioInit(&layer_weights_2_);
+
+  // reinit both weight matrices as one unit
+  PairGlorotBengioInit(&layer_weights_, &layer_weights_2_);
+
   // update the pointers to them as well as realloc will require it
   p_layer_weights_2_ = PointerWithSize(layer_weights_2_);
   p_layer_weight_gradients_2_ =

From e3bbbc32167ade693d775ade0c0146ba4c4fbb67 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Tue, 2 Mar 2021 14:15:36 -0600
Subject: [PATCH 488/660] SAGE distribution: sync 2nd set of weights

SAGE layer now supports distributed execution: this commit adds the
second sync required for the 2nd set of weights to the backward pass
---
 libgnn/include/galois/layers/SAGELayer.h | 15 +++++++++++++++
 libgnn/src/layers/SAGELayer.cpp          | 11 ++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h
index a489913ef5..431a7f0696 100644
--- a/libgnn/include/galois/layers/SAGELayer.h
+++ b/libgnn/include/galois/layers/SAGELayer.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "galois/layers/GNNLayer.h"
+#include "galois/layers/GradientSyncStructures.h"

 #ifdef GALOIS_ENABLE_GPU
 // TODO(loc/hochan)
@@ -91,6 +92,14 @@ class SAGELayer : public GNNLayer {
   //!
override parent function: optimizes the second set of weights as well void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + //! Sync second set of weight gradients + void WeightGradientSyncSum2() { + // TODO bitset + gradient_sync_substrate_2_ + ->sync( + "WeightGradients2Sync"); + } + //! SAGE config params SAGELayerConfig sage_config_; //! Need own optimizer for the 2nd weight matrix @@ -102,6 +111,12 @@ class SAGELayer : public GNNLayer { PointerWithSize p_layer_weights_2_; PointerWithSize p_layer_weight_gradients_2_; + //! Wrapper over 2nd gradient matrix to make it compatible with Gluon + std::unique_ptr gradient_sync_interface_2_; + //! Synchronization substrate for the 2nd weight gradients + std::unique_ptr> + gradient_sync_substrate_2_; + // 2 temporaries the size of the forward input; used for dropout and // aggregation (if either are required) std::vector in_temp_1_; diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index c9b2a16da7..3c8184faee 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -28,6 +28,15 @@ galois::SAGELayer::SAGELayer(size_t layer_num, // initialize the optimizer std::vector weight_size = {num_weight_elements}; second_weight_optimizer_ = std::make_unique(weight_size, 1); + + // initialize sync substrate for second set + gradient_sync_interface_2_ = + std::make_unique(layer_weight_gradients_2_); + gradient_sync_substrate_2_ = std::make_unique< + galois::graphs::GluonSubstrate>( + *gradient_sync_interface_2_, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_input_elements = @@ -165,6 +174,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( input_to_use.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); } + WeightGradientSyncSum2(); // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -245,7 +255,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } } - // TODO(loc) sync both weight matrices WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { From 97ca9be9dd3aeeb51ffa23b960c458afaff99165 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Mar 2021 16:56:32 -0600 Subject: [PATCH 489/660] Cleanup some prints in cusp and gnn; ogbn100m Lots of trace prints/debug prints being printed that this commit cleans up. Also adds the training boundaries for the ogbn-100M dataset. --- .../include/galois/graphs/DistributedGraph.h | 26 +++--- libcusp/include/galois/graphs/NewGeneric.h | 79 +++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 8 +- libgnn/src/layers/GNNLayer.cpp | 4 +- libgnn/src/layers/GluonGradientInterface.cpp | 4 +- libgnn/src/layers/SoftmaxLayer.cpp | 17 +++- 6 files changed, 75 insertions(+), 63 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index bf70bbf3e2..d13350f848 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -60,16 +60,16 @@ enum MASTERS_DISTRIBUTION { * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph */ -template +template class DistGraph { private: //! 
Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; - using GraphTy = galois::graphs::LC_CSR_Graph; + using GraphTy = + galois::graphs::LC_CSR_Graph; // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -393,10 +393,10 @@ class DistGraph { galois::runtime::reportStatCond_Tmax( GRNAME, "MasterDistTime", timer.get()); - galois::gPrint( + galois::gDebug( "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), - " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)\n"); + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); return numNodes_to_divide; } @@ -866,9 +866,7 @@ class DistGraph { /** * Deallocates underlying LC CSR Graph */ - void deallocate() { - graph.deallocate(); - } + void deallocate() { graph.deallocate(); } /** * Sort the underlying LC_CSR_Graph by ID (destinations) @@ -882,10 +880,10 @@ class DistGraph { galois::no_stats(), galois::loopname("CSREdgeSort"), galois::steal()); } -//////////////////////////////////////////////////////////////////////////////// -// what follows are GNN functions; some are not great (e.g. expose arrays) -// TODO figure out better way to do this -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + // what follows are GNN functions; some are not great (e.g. expose arrays) + // TODO figure out better way to do this + //////////////////////////////////////////////////////////////////////////////// EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); } NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); } }; diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 771c5b5143..c29127d9e6 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -119,6 +119,10 @@ class NewDistGraphGeneric : public DistGraph { // this is entire graph: amazon's mask isn't contiguous bps.push_back(0); bps.push_back(86618); + } else if (filename.find("ogbn-papers100M") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(602); + bps.push_back(111052523); } else { // TODO(loc) only die under certain conditions; don't die if something // is missing @@ -224,9 +228,8 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.start(); if (readFromFile) { - galois::gPrint("[", base_DistGraph::id, - "] Reading local graph from file ", localGraphFileName, - "\n"); + galois::gDebug("[", base_DistGraph::id, + "] Reading local graph from file ", localGraphFileName); base_DistGraph::read_local_graph_from_file(localGraphFileName); Tgraph_construct.stop(); return; @@ -312,7 +315,7 @@ class NewDistGraphGeneric : public DistGraph { // phase 0 - galois::gPrint("[", base_DistGraph::id, "] Starting graph reading.\n"); + galois::gDebug("[", base_DistGraph::id, "] Starting graph reading."); galois::graphs::BufferedGraph bufGraph; bufGraph.resetReadCounters(); galois::StatTimer graphReadTimer("GraphReading", GRNAME); @@ -321,18 +324,16 @@ class NewDistGraphGeneric : public DistGraph { *edgeEnd, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges); graphReadTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Reading graph complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Reading graph 
complete."); if (graphPartitioner->masterAssignPhase()) { // loop over all nodes, determine where neighbors are, assign masters galois::StatTimer phase0Timer("Phase0", GRNAME); - galois::gPrint("[", base_DistGraph::id, - "] Starting master assignment.\n"); + galois::gDebug("[", base_DistGraph::id, "] Starting master assignment."); phase0Timer.start(); phase0(bufGraph, cuspAsync, stateRounds); phase0Timer.stop(); - galois::gPrint("[", base_DistGraph::id, - "] Master assignment complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Master assignment complete."); } galois::StatTimer inspectionTimer("EdgeInspection", GRNAME); @@ -447,13 +448,14 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::initializeSpecificRanges(); Tgraph_construct.stop(); - galois::gPrint("[", base_DistGraph::id, "] Graph construction complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); // report state rounds if (base_DistGraph::id == 0) { galois::runtime::reportStat_Single(GRNAME, "CuSPStateRounds", (uint32_t)stateRounds); } + galois::gPrint("[", base_DistGraph::id, "] Dist graph constructed\n"); } private: @@ -1363,7 +1365,7 @@ class NewDistGraphGeneric : public DistGraph { if (async) { if (base_DistGraph::id == 0) { - galois::gPrint("Using asynchronous master determination sends.\n"); + galois::gDebug("Using asynchronous master determination sends."); } hostFinished.resize(base_DistGraph::numHosts); @@ -1381,8 +1383,8 @@ class NewDistGraphGeneric : public DistGraph { #endif if (base_DistGraph::id == 0) { - galois::gPrint("Number of BSP sync rounds in master assignment: ", - stateRounds, "\n"); + galois::gDebug("Number of BSP sync rounds in master assignment: ", + stateRounds); } // galois::PerThreadTimer ptt( @@ -1484,9 +1486,9 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::increment_evilPhase(); } - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Local master assignment " - "complete.\n"); + "complete."); // one more step: let masters know of nodes they own (if they don't // have the node locally then this is the only way they will learn about @@ -1498,7 +1500,7 @@ class NewDistGraphGeneric : public DistGraph { recvMastersToOwners(); p0master2ownerTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Received my master mappings.\n"); + galois::gDebug("[", base_DistGraph::id, "] Received my master mappings."); base_DistGraph::increment_evilPhase(); @@ -1543,11 +1545,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // get incoming mirrors ready for creation uint32_t additionalMirrorCount = incomingMirrors.count(); @@ -1646,7 +1647,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = 
base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1677,10 +1678,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1698,7 +1699,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1728,10 +1729,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1764,11 +1765,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); // report edge inspection time uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // old inspection barrier // galois::runtime::getHostBarrier().wait(); @@ -2138,7 +2138,7 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); - galois::gPrint("[", base_DistGraph::id, "] Inspection sends complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection sends complete."); } /** @@ -2218,8 +2218,7 @@ class NewDistGraphGeneric : public DistGraph { } } - galois::gPrint("[", base_DistGraph::id, - "] Inspection receives complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection receives complete."); } /** @@ -2246,7 +2245,7 @@ class NewDistGraphGeneric : public DistGraph { inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges); finalizeInspection(prefixSumOfEdges); - galois::gPrint("[", base_DistGraph::id, "] Inspection mapping complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection mapping complete."); return prefixSumOfEdges; } @@ -2598,9 +2597,9 @@ class NewDistGraphGeneric : public DistGraph { galois::graphs::BufferedGraph& bufGraph) { if (base_DistGraph::id == 0) { if (std::is_void::value) { - fprintf(stderr, "Loading void edge-data while creating edges.\n"); + galois::gDebug("Loading void edge-data while creating edges."); } else { - fprintf(stderr, "Loading edge-data while creating edges.\n"); + galois::gDebug(stderr, "Loading edge-data while creating edges."); } } @@ -2625,10 +2624,10 @@ class NewDistGraphGeneric : public DistGraph { loadEdgeTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Edge loading time: 
", + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", loadEdgeTimer.get_usec() / 1000000.0f, " seconds to read ", bufBytesRead, " bytes (", - bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)\n"); + bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)"); } // Edge type is not void. (i.e. edge data exists) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fd87c08fb6..4efae3c429 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -166,7 +166,7 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - // XXX bitset setting + // TODO(loc) bitset setting sync_substrate_->sync( "GraphAggregateSync"); } @@ -184,7 +184,7 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_column_length_ = matrix_column_size; cuda_ctx_for_sync = cuda_ctx_; layer_number_to_sync = layer_number; - // XXX bitset setting + // TODO bitset setting // call sync cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), layer_number); @@ -297,7 +297,9 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, size_t num_nodes; file_stream >> num_nodes >> num_label_classes_ >> std::ws; assert(num_nodes == partitioned_graph_->globalSize()); - galois::gPrint("Number of label classes is ", num_label_classes_, "\n"); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } // allocate memory for labels if (has_single_class_label) { diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 9f228cad25..92c1fa3250 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -244,7 +244,7 @@ void galois::GNNLayer::ActivationDerivative( } void galois::GNNLayer::WeightGradientSyncSum() { - // XXX bitset + // TODO bitset gradient_sync_substrate_->sync( "WeightGradientsSync"); } @@ -252,7 +252,7 @@ void galois::GNNLayer::WeightGradientSyncSum() { void galois::GNNLayer::WeightGradientSyncAverage() { size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; if (num_hosts > 1) { - // XXX bitset + // TODO bitset // sum, then average by dividing all by num hosts (every host participates // in sync) gradient_sync_substrate_->sync( diff --git a/libgnn/src/layers/GluonGradientInterface.cpp b/libgnn/src/layers/GluonGradientInterface.cpp index 31770afb4e..74e612af17 100644 --- a/libgnn/src/layers/GluonGradientInterface.cpp +++ b/libgnn/src/layers/GluonGradientInterface.cpp @@ -44,6 +44,6 @@ galois::GluonGradientInterface::GluonGradientInterface( } } - galois::gInfo("[", my_host, "] Weight gradients: this host owns ", - begin_master_, " to ", end_master_); + galois::gDebug("[", my_host, "] Weight gradients: this host owns ", + begin_master_, " to ", end_master_); } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index f541b43a18..a268089b33 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,7 +8,12 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; - // TODO(loc) once needed for accuracy debugging, print out loss +#ifndef NDEBUG + galois::DGAccumulator loss_accum; + galois::DGAccumulator handled; + loss_accum.reset(); + handled.reset(); +#endif galois::do_all( 
galois::iterate(graph_.begin_owned(), graph_.end_owned()), @@ -22,7 +27,6 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], &forward_output_matrix_[feature_length * i]); - // create ground truth vector for this LID std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); @@ -36,11 +40,20 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), &forward_output_matrix_[feature_length * i]); +#ifndef NDEBUG + loss_accum += input_loss_[i]; + handled += 1; +#endif } }, // TODO chunk size? // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); +#ifndef NDEBUG + GNNFloat reduced_loss = loss_accum.reduce(); + size_t t = handled.reduce(); + galois::gPrint("Loss is ", reduced_loss / t, "\n"); +#endif return forward_output_matrix_; } From 7b0d246f6ef22477955b44111d7ce0efc369c7bf Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 3 Mar 2021 13:36:13 -0600 Subject: [PATCH 490/660] aggregate sync test for gnn runs via ctest Added CMake code to run the aggregation sync test with multiple processes. Also changed a few of the other tests to make them run for less time. TODO: weight-sync test is broken, need to fix + add other partitioning policies. --- libgnn/test/CMakeLists.txt | 16 ++++++++++++++-- libgnn/test/epoch-test.cpp | 2 +- libgnn/test/gnngraph-test.cpp | 6 +++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 853c5a22f9..b1170e2d16 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,6 +2,15 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) +# multihost testing things +set(hosts) +set(host 12) +while (${host} GREATER 1) + list(APPEND hosts ${host}) + math(EXPR host "${host} - 1") +endwhile() +list(APPEND hosts "1") + if (NOT GALOIS_ENABLE_GPU) add_executable(convlayer-test convlayer-test.cpp) target_link_libraries(convlayer-test galois_gnn) @@ -47,13 +56,16 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(multilabel-epoch-test galois_gnn) add_test(NAME multilabel-epoch-test COMMAND multilabel-epoch-test) - # TODO figure out how to make this test run in parallel add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) - #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) + foreach(host_count ${hosts}) + add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} aggregate-sync-test) + set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") + endforeach() add_executable(weight-sync-test weight-sync-test.cpp) target_link_libraries(weight-sync-test galois_gnn) + # TODO multi host tests add_executable(multilabel-read multilabel-read.cpp) target_link_libraries(multilabel-read galois_gnn) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 2dbaea3372..ed665684f1 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -43,7 +43,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 100; epoch++) { + for (size_t epoch = 0; epoch < 25; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); 
gnn->GradientPropagation();
     galois::gPrint("Epoch ", epoch, ": Accuracy is ",
diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp
index 5aa4d72ddf..101540f4d5 100644
--- a/libgnn/test/gnngraph-test.cpp
+++ b/libgnn/test/gnngraph-test.cpp
@@ -14,12 +14,12 @@ int main() {
                  galois::runtime::getSystemNetworkInterface().ID,
                  num_threads);

-  // multi level reading tested in another test
+  // note multi level reading tested in another test
   GALOIS_LOG_VERBOSE("reddit with single label, oec");
-  galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC,
+  galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC,
                            true);
   GALOIS_LOG_VERBOSE("reddit with single label, cvc");
-  galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC,
+  galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC,
                            true);

   return 0;

From f2960b1ac2a5f4c20ddb497bc4e278b15e0f14071393c6a61910ec2af Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Wed, 3 Mar 2021 16:48:20 -0600
Subject: [PATCH 491/660] GNN replication factor fix

Replication factor stat was wrong for GNNs because multiple sync
substrates were being created that all used the same replication
factor name. Resolve issue by only reporting it if the object the
substrate is created for is a graph (should only be 1 graph in any
execution).
---
 libcusp/include/galois/graphs/DistributedGraph.h |  3 +++
 .../include/galois/graphs/GluonSubstrate.h       | 27 +++++++++++--------
 .../galois/layers/GluonGradientInterface.h       |  2 ++
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h
index d13350f848..42b659fa67 100644
--- a/libcusp/include/galois/graphs/DistributedGraph.h
+++ b/libcusp/include/galois/graphs/DistributedGraph.h
@@ -886,6 +886,9 @@ class DistGraph {
   ////////////////////////////////////////////////////////////////////////////////
   EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); }
   NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); }
+
+  //! Used by substrate to determine if some stats are to be reported
+  bool is_a_graph() const { return true; }
 };

 template
class GluonSubstrate : public galois::runtime::GlobalObject {
 private:
+  bool is_a_graph_{false};
+
   //!
Synchronization type
   enum SyncType {
     syncReduce,   //!< Reduction sync
@@ -340,16 +342,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject {
    */
   void reportProxyStats(uint64_t global_total_mirror_nodes,
                         uint64_t GALOIS_UNUSED(global_total_owned_nodes)) {
-    float replication_factor =
-        (float)(global_total_mirror_nodes + userGraph.globalSize()) /
-        (float)userGraph.globalSize();
-    galois::runtime::reportStat_Single(RNAME, "ReplicationFactor",
-                                       replication_factor);
+    if (is_a_graph_) {
+      float replication_factor =
+          (float)(global_total_mirror_nodes + userGraph.globalSize()) /
+          (float)userGraph.globalSize();
+      galois::runtime::reportStat_Single(RNAME, "ReplicationFactor",
+                                         replication_factor);

-    galois::runtime::reportStatCond_Single(
-        RNAME, "TotalNodes", userGraph.globalSize());
-    galois::runtime::reportStatCond_Single(
-        RNAME, "TotalGlobalMirrorNodes", global_total_mirror_nodes);
+      galois::runtime::reportStatCond_Single(
+          RNAME, "TotalNodes", userGraph.globalSize());
+      galois::runtime::reportStatCond_Single(
+          RNAME, "TotalGlobalMirrorNodes", global_total_mirror_nodes);
+    }
   }

   ////////////////////////////////////////////////////////////////////////////////
@@ -431,12 +435,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject {
         substrateDataMode(_enforcedDataMode), numHosts(numHosts), num_run(0),
         num_round(0), currentBVFlag(nullptr),
         mirrorNodes(userGraph.getMirrorNodes()) {
+    is_a_graph_ = _userGraph.is_a_graph();
     if (cartesianGrid.first != 0 && cartesianGrid.second != 0) {
       GALOIS_ASSERT(cartesianGrid.first * cartesianGrid.second == numHosts,
                     "Cartesian split doesn't equal number of hosts");
       if (id == 0) {
-        galois::gInfo("Gluon optimizing communication for 2-D cartesian cut: ",
-                      cartesianGrid.first, " x ", cartesianGrid.second);
+        galois::gDebug("Gluon optimizing communication for 2-D cartesian cut: ",
+                       cartesianGrid.first, " x ", cartesianGrid.second);
       }
       isCartCut = true;
     } else {
diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h
index 473151efcd..a41ca0cb4d 100644
--- a/libgnn/include/galois/layers/GluonGradientInterface.h
+++ b/libgnn/include/galois/layers/GluonGradientInterface.h
@@ -59,6 +59,8 @@ class GluonGradientInterface {
   unsigned getEdgeData(uint32_t) const { return 0; }
   void deallocate() const {};

+  bool is_a_graph() const { return false; }
+
 private:
   //! Reference to gradients that can get synchronized
   std::vector& gradients_;

From 660e5ea968ee46dc01d006c2f06651437571c3e2 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Wed, 3 Mar 2021 17:07:49 -0600
Subject: [PATCH 492/660] Timer changes for layers and GNN

- GNN: single train/val/test timer that becomes more fine-grained as
  necessary (e.g. when reporting a test/val forward phase).
- Added timers to SAGE.
- Separated forward/backward aggregation timers.
- Chunk size 1 for do_all aggregation --- .../galois/layers/GraphConvolutionalLayer.h | 5 +++ libgnn/include/galois/layers/SAGELayer.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 37 +++++++++++-------- libgnn/src/layers/GraphConvolutionalLayer.cpp | 24 ++++++++++-- libgnn/src/layers/SAGELayer.cpp | 37 ++++++++++++++++++- 5 files changed, 82 insertions(+), 22 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 47980dcd0c..4c884c129f 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -67,6 +67,11 @@ class GraphConvolutionalLayer : public GNNLayer { AggregateAll(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts); + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward); //! Do embedding update via mxm with this layer's weights (forward) void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 431a7f0696..b72e9dca50 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -58,6 +58,7 @@ class SAGELayer : public GNNLayer { PointerWithSize* input_gradient) final; private: + static const constexpr char* kRegionName = "SAGELayer"; //! CPU aggregation void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7955d9e92f..b31a31ecd1 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -147,12 +147,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); } + galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); + galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); + galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { - const std::string t_name = "TrainEpoch" + std::to_string(epoch); - const std::string t_name_acc = t_name + "Accuracy"; - galois::StatTimer epoch_timer(t_name.c_str(), "GraphNeuralNetwork"); epoch_timer.start(); if (config_.do_sampling()) { // subgraph sample every epoch @@ -166,6 +166,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { + const std::string t_name_acc = + "TrainEpoch" + std::to_string(epoch) + "Accuracy"; galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); galois::runtime::reportStat_Single("GraphNeuralNetwork", t_name_acc, @@ -184,44 +186,43 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } if (do_validate) { - const std::string v_name = "ValEpoch" + std::to_string(epoch); - const std::string v_name_acc = v_name + "Accuracy"; - galois::StatTimer val_epoch_timer(v_name.c_str(), "GraphNeuralNetwork"); - - val_epoch_timer.start(); + validation_timer.start(); SetLayerPhases(galois::GNNPhase::kValidate); const PointerWithSize val_pred = DoInference(); - val_epoch_timer.stop(); + validation_timer.stop(); float val_acc = 
GetGlobalAccuracy(val_pred); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, "\n"); + const std::string v_name_acc = + "ValEpoch" + std::to_string(epoch) + "Accuracy"; galois::runtime::reportStat_Single("GraphNeuralNetwork", v_name_acc, val_acc); } } if (do_test) { - const std::string test_name = "TestEpoch" + std::to_string(epoch); - const std::string test_name_acc = test_name + "Accuracy"; - galois::StatTimer test_epoch_timer(test_name.c_str(), - "GraphNeuralNetwork"); - - test_epoch_timer.start(); + epoch_test_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize test_pred = DoInference(); - test_epoch_timer.stop(); + epoch_test_timer.stop(); float test_acc = GetGlobalAccuracy(test_pred); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Accuracy"; galois::runtime::reportStat_Single("GraphNeuralNetwork", test_name_acc, test_acc); } } if (do_validate || do_test) { + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + "GraphNeuralNetwork", "ElapsedTrainTimeEpoch" + std::to_string(epoch), + epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); // get back inductive norm factor as necessary; sampling norm is handled @@ -232,6 +233,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } + uint64_t average_epoch_time = epoch_timer.get() / num_epochs; + galois::runtime::reportStat_Tavg("GraphNeuralNetwork", "AverageEpochTime", + average_epoch_time); + if (altered_norm_factor) { graph_->CalculateFullNormFactor(); } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 7d7667a624..419d813119 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -132,7 +132,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), p_backward_output_matrix_.data(), - &input_column_intermediates_); + &input_column_intermediates_, true); // TODO if training A, then A' compute here if layer # is 0 // dot product of edges that exist in A } @@ -162,7 +162,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // required in this case for the weight gradient calculation // this is (FW)' AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_); + p_out_temp_.data(), &output_column_intermediates_, true); if (layer_number_ != 0) { // derivative for update // backout = F' @@ -208,7 +208,22 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, [[maybe_unused]] galois::substrate::PerThreadStorage>* pts) { - galois::StatTimer timer("Aggregate", kRegionName); + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); +} + +void galois::GraphConvolutionalLayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts, + bool is_backward) { + std::string agg_timer_name = "Aggregate"; + if (!is_backward) { + agg_timer_name += "Forward"; + } else { + agg_timer_name += "Backward"; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); timer.start(); #ifdef 
GALOIS_ENABLE_GPU @@ -311,7 +326,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } } }, - galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + galois::chunk_size<1>(), galois::steal(), + galois::loopname("ConvolutionalAggregateAll")); // aggregate sync graph_.AggregateSync(aggregate_output, column_length); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 3c8184faee..f8632dd7f0 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -89,7 +89,9 @@ void MatrixAdd(size_t num_nodes, galois::PointerWithSize in, const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { - GALOIS_LOG_VERBOSE("Calling forward phase"); + galois::StatTimer timer("ForwardPhase", kRegionName); + timer.start(); + assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_in_temp_1_.size() == input_embeddings.size()); @@ -138,12 +140,17 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + timer.stop(); + return p_forward_output_matrix_; } galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { + galois::StatTimer timer("BackwardPhase", kRegionName); + timer.start(); + assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation @@ -261,6 +268,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( DoDropoutDerivative(); } + timer.stop(); return p_backward_output_matrix_; } @@ -278,6 +286,15 @@ void galois::SAGELayer::AggregateAll( [[maybe_unused]] galois::substrate::PerThreadStorage>* pts, bool is_backward) { + std::string agg_timer_name = "Aggregate"; + if (!is_backward) { + agg_timer_name += "Forward"; + } else { + agg_timer_name += "Backward"; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + timer.start(); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.AggregateAllGPU( @@ -291,6 +308,7 @@ void galois::SAGELayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::SAGELayer::AggregateAllCPU( @@ -371,13 +389,16 @@ void galois::SAGELayer::AggregateAllCPU( } } }, - galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + galois::chunk_size<1>(), galois::steal(), + galois::loopname("ConvolutionalAggregateAll")); // aggregate sync graph_.AggregateSync(aggregate_output, column_length); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("ForwardXForm", kRegionName); + timer.start(); #ifdef GALOIS_ENABLE_GPU // TODO self change if (device_personality == DevicePersonality::GPU_CUDA) { @@ -395,10 +416,13 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::SAGELayer::SelfFeatureUpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("SelfForwardXForm", kRegionName); + timer.start(); #ifdef GALOIS_ENABLE_GPU // TODO self change #endif @@ -410,10 +434,14 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( #ifdef GALOIS_ENABLE_GPU } #endif +timer.stop(); } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("BackwardXForm", 
kRegionName); + timer.start(); + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU @@ -433,10 +461,14 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("SelfBackwardXForm", kRegionName); + timer.start(); + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU @@ -451,6 +483,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU #endif + timer.stop(); } void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, From 89604a5a0179cba629dd691471acd578d112df7b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 4 Mar 2021 16:11:43 -0600 Subject: [PATCH 493/660] Forcefully change last layer to match label classes Rather than forcing user to specify the last layer's size correctly, this commit adds code that will automatically overwrite the layer size with the correct size. --- lonestar/libgnnbench/src/Input.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d8975204c5..0bc508963d 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -169,6 +169,15 @@ CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { } // verify user satisfies last intermediate layer needing to have same size // as # label classes + if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { + galois::gWarn( + "Size of last layer (", layer_sizes_vector.back(), + ") is not equal to # label classes: forcefully changing it to ", + gnn_graph->GetNumLabelClasses()); + layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); + layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); + } + GALOIS_LOG_ASSERT(layer_sizes_vector.back() == gnn_graph->GetNumLabelClasses()); } else { From 1bd09da74741682db44c95fdb351754487caec82 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 14:41:58 -0600 Subject: [PATCH 494/660] Dist. acc consistency, weight sync collective Replaces the weight sync Gluon call with MPI collectives: much more efficient as it removes the need for point to point. Will go in and delete the Gluon substrate for weight in another commit later. No-dropout distributed execution should now result in 100% consistent accuracy with single host runs. The issue is that you can use a masked gradient for the weight grad calculation, but the feat grad calc requires that you have the non-masked version since you may need mirror info. Before this commit, the masked version was used everywhere which would lead to inconsistency down the line. This has been corrected in both GCN and SAGE. The output layers need a change too (loop over all nodes rather than masters only). Added back-conv-test, which tests to make sure weights/feats gradients passed back from a GCN layer are consistent/correct. This does not catch some corner cases as I found during debugging due to the tester graph being too small. Also do not allocate ptemp2 if it's not required to get some space savings. (significant if # nodes and feature length is high).
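Illustrative sketch (not part of this patch): the collective mentioned above is an in-place element-wise sum of the local weight gradients across hosts. A minimal standalone version is below, assuming an MPI build; the buffer size and names are made up for illustration only.

    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int host_id = 0;
      MPI_Comm_rank(MPI_COMM_WORLD, &host_id);

      // each host holds its own flattened weight-gradient matrix
      std::vector<float> weight_gradients(4, static_cast<float>(host_id + 1));

      // element-wise sum over all hosts; afterwards every host holds the same
      // summed gradients, so the following optimizer step is identical on all
      // hosts, which is what the accuracy-consistency argument above relies on
      MPI_Allreduce(MPI_IN_PLACE, weight_gradients.data(),
                    static_cast<int>(weight_gradients.size()), MPI_FLOAT,
                    MPI_SUM, MPI_COMM_WORLD);

      if (host_id == 0) {
        std::printf("gradient[0] after all-reduce: %f\n", weight_gradients[0]);
      }
      MPI_Finalize();
      return 0;
    }

A single all-reduce like this replaces the per-gradient point-to-point exchange, which is where the efficiency claim in the message comes from.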
--- libgnn/include/galois/graphs/GNNGraph.h | 6 +- libgnn/include/galois/layers/GNNLayer.h | 7 +- libgnn/include/galois/layers/SAGELayer.h | 7 +- libgnn/src/layers/GNNLayer.cpp | 89 +++++++-- libgnn/src/layers/GraphConvolutionalLayer.cpp | 16 +- libgnn/src/layers/SAGELayer.cpp | 93 ++++++--- libgnn/src/layers/SigmoidLayer.cpp | 4 +- libgnn/src/layers/SoftmaxLayer.cpp | 4 +- libgnn/test/CMakeLists.txt | 9 +- libgnn/test/back-conv-test.cpp | 176 ++++++++++++++++++ libgnn/test/weight-sync-test.cpp | 45 ----- 11 files changed, 341 insertions(+), 115 deletions(-) create mode 100644 libgnn/test/back-conv-test.cpp delete mode 100644 libgnn/test/weight-sync-test.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 02cef8e621..071b33aeac 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -67,14 +67,16 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } + size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } + size_t GetGID(size_t lid) const { return partitioned_graph_->getGID(lid); } + //! Node begin for all local nodes NodeIterator begin() const { return partitioned_graph_->allNodesRange().begin(); } //! Node end for all local nodes NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } - //! Return GID of some local node - size_t GetGID(unsigned lid) const { return partitioned_graph_->getGID(lid); } NodeIterator begin_owned() const { return partitioned_graph_->masterNodesRange().begin(); diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index ecd79bec34..27fd1ac0c7 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -281,6 +281,8 @@ class GNNLayer { PointerWithSize* output_matrix); //! Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); + void ReconstructDropoutMatrix(const PointerWithSize input_to_drop, + PointerWithSize* output_matrix); //! Does some activation function based on configuration on forward output //! matrix @@ -290,9 +292,6 @@ class GNNLayer { //! Synchronize weight gradients with a summation void WeightGradientSyncSum(); - //! Synchronize weight gradients with a summation, then locally divide all - //! weights to get an average - void WeightGradientSyncAverage(); #ifdef GALOIS_ENABLE_GPU //! Object that holds all GPU allocated pointers to memory related to layers @@ -302,6 +301,8 @@ class GNNLayer { base_gpu_object_.CopyToWeights(layer_weights_); } #endif + + void MaskGradientNonMasters(PointerWithSize* gradients); }; } // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index b72e9dca50..6825812315 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -94,12 +94,7 @@ class SAGELayer : public GNNLayer { void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); //! Sync second set of weight gradients - void WeightGradientSyncSum2() { - // TODO bitset - gradient_sync_substrate_2_ - ->sync( - "WeightGradients2Sync"); - } + void WeightGradientSyncSum2(); //! 
SAGE config params SAGELayerConfig sage_config_; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 92c1fa3250..1084bf9010 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -180,6 +180,33 @@ void galois::GNNLayer::DoDropout( timer.stop(); } +void galois::GNNLayer::ReconstructDropoutMatrix( + const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); + timer.start(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // TODO(hochan) + GALOIS_LOG_FATAL("Implement me"); + } else { +#endif + // reuse the dropout mask from a previous dropout call + size_t num_elements = output_matrix->size(); + GNNFloat scale = 1. / (1. - config_.dropout_rate); + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("ReconstructDropout")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); +} + void galois::GNNLayer::DoDropoutDerivative() { galois::StatTimer timer("BackwardDropout", "GNNLayer"); timer.start(); @@ -244,26 +271,30 @@ void galois::GNNLayer::ActivationDerivative( } void galois::GNNLayer::WeightGradientSyncSum() { - // TODO bitset + galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); + t.start(); +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) collectives here rather than gluon sync if possible like the + // CPU code + // preferably without needing to do a gpu->cpu copy + galois::gWarn( + "GPU still using inefficient point to point comms for weight sync"); gradient_sync_substrate_->sync( "WeightGradientsSync"); -} - -void galois::GNNLayer::WeightGradientSyncAverage() { - size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; - if (num_hosts > 1) { - // TODO bitset - // sum, then average by dividing all by num hosts (every host participates - // in sync) - gradient_sync_substrate_->sync( - "WeightGradientsSyncAverage"); - galois::do_all( - galois::iterate(static_cast(0), layer_weight_gradients_.size()), - [&](size_t weight_index) { - layer_weight_gradients_[weight_index] /= num_hosts; - }, - galois::loopname("WeightGradientSyncAverageDivide")); +#else + // TODO(loc) remove this limitation later; can just do a loop over the weight + // matrix + if (p_layer_weight_gradients_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_.data()), + static_cast(p_layer_weight_gradients_.size()), MPI_FLOAT, + MPI_SUM, MPI_COMM_WORLD); +#endif + t.stop(); } void galois::GNNLayer::SyncInitialWeights() { @@ -271,7 +302,7 @@ void galois::GNNLayer::SyncInitialWeights() { return; } #ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) + // TODO(loc/hochan); not required at the moment however GALOIS_LOG_FATAL("Need to implement GPU version of this"); #endif // copy weights over to gradients @@ -287,3 +318,25 @@ void galois::GNNLayer::SyncInitialWeights() { layer_weight_gradients_[i] = 0; } } + +void galois::GNNLayer::MaskGradientNonMasters( + PointerWithSize* gradient) { +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) mask away the **non** masters on gpu + GALOIS_LOG_FATAL("implement this"); +#else + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.size(); + size_t 
row_index = layer_dimensions_.output_columns; + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradient)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskGradientNonMasters")); +#endif +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 419d813119..8d3c6754a2 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -11,8 +11,10 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); - // TODO temp2 does not need to be initialized in all circumstances - in_temp_2_.resize(num_input_elements, 0); + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + in_temp_2_.resize(num_input_elements, 0); + } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; @@ -50,7 +52,6 @@ galois::GraphConvolutionalLayer::ForwardPhase( assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_in_temp_1_.size() == input_embeddings.size()); - assert(p_in_temp_2_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on @@ -78,8 +79,6 @@ galois::GraphConvolutionalLayer::ForwardPhase( &output_column_intermediates_); } - // TODO synchronization of aggregation functions - if (!config_.disable_activation) { GALOIS_LOG_VERBOSE("Doing activation"); Activation(); @@ -88,6 +87,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); timer.stop(); + return p_forward_output_matrix_; } @@ -138,6 +138,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there + MaskGradientNonMasters(input_gradient); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -169,8 +170,9 @@ galois::GraphConvolutionalLayer::BackwardPhase( UpdateEmbeddingsDerivative(p_out_temp_.data(), p_backward_output_matrix_.data()); } - // TODO put this in a function // W' = F^T (FW)' + MaskGradientNonMasters(&p_out_temp_); + // TODO put this in a function #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -191,8 +193,6 @@ galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already - // TODO figure out how to do this with GPUs - // WeightGradientSyncAverage(); WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index f8632dd7f0..194563610d 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -42,8 +42,11 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); - // TODO temp2 does not need to be initialized in all 
circumstances - in_temp_2_.resize(num_input_elements, 0); + // only need to allocate if input <= output because not used otherwise + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + in_temp_2_.resize(num_input_elements, 0); + } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; @@ -74,17 +77,27 @@ galois::SAGELayer::SAGELayer(size_t layer_num, GALOIS_LOG_VERBOSE("SAGE layer initialized"); } -void MatrixAdd(size_t num_nodes, galois::PointerWithSize in, - galois::PointerWithSize* out) { - assert(in.size() == out->size()); - assert((in.size() % num_nodes) == 0); - size_t column_size = in.size() / num_nodes; - // split matrix to threads - galois::do_all(galois::iterate(size_t{0}, num_nodes), [&](size_t node) { - size_t my_offset = node * column_size; - galois::VectorAdd(column_size, &(in[my_offset]), - &((out->data())[my_offset]), &(out->data()[my_offset])); - }); +void galois::SAGELayer::WeightGradientSyncSum2() { + galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); + t.start(); +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) collectives here rather than gluon sync if possible like the + // CPU code + GALOIS_LOG_FATAL("implement me"); +#else + // TODO(loc) remove this limitation later; can just do a loop over the weight + // matrix + if (p_layer_weight_gradients_2_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); + } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_2_.data()), + static_cast(p_layer_weight_gradients_2_.size()), MPI_FLOAT, + MPI_SUM, MPI_COMM_WORLD); +#endif + t.stop(); } const galois::PointerWithSize galois::SAGELayer::ForwardPhase( @@ -95,7 +108,6 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_in_temp_1_.size() == input_embeddings.size()); - assert(p_in_temp_2_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on @@ -170,30 +182,21 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // AFW = O - if (!sage_config_.disable_concat) { - // Fw1 + AFW2 = O; self feature has own weight matrix and makes own - // contribution to gradients which is handled in this block - // !!!! do this early because p_in_temp may get overwritten later - // if update occurs before aggregate !!! 
- galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - input_to_use.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); - } - WeightGradientSyncSum2(); // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { if (layer_number_ != 0) { + // ---unmasked--- // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() == layer_dimensions_.input_rows * layer_dimensions_.output_columns); assert(p_in_temp_1_.size() == layer_dimensions_.input_columns * layer_dimensions_.input_rows); // pintemp1 contains (AF)' + // overwrites the dropout matrix that was in ptemp1 (needed for second + // weight matrix) UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); // pback contains F' // derivative of aggregate is the same due to symmetric graph @@ -203,6 +206,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there + // ---masked--- + MaskGradientNonMasters(input_gradient); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -224,6 +229,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation // this is (FW)' + // --unmasked-- AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); if (layer_number_ != 0) { @@ -235,6 +241,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // TODO put this in a function // W' = F^T (FW)' // input to use is not overwritten in this branch so it's safe to use + // --- masked ---, uses ptemp1 + MaskGradientNonMasters(&p_out_temp_); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -254,6 +262,39 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } if (!sage_config_.disable_concat) { + // Fw1 + AFW2 = O; self feature has own weight matrix and makes own + // contribution to gradients which is handled in this block + // second weight matrix: reconstruct the dropout matrix if it was + // overwritten into temp1 + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.disable_dropout) { + // input gradients have already been masked; need to reconstruct the + // dropout matrix which we can do since we saved the dropout mask + // save it into ptemp1 + ReconstructDropoutMatrix(prev_layer_input, &p_in_temp_1_); + // !!!NOTE!!! 
+ // If you're using dropout in the distributed setting you've already + // thrown consistency out the window anyways because distributed RNG + // will make it so each host does something different + // Therefore, this op above is nothing more than a feeble attempt + // at getting *some* notion of consistency + } + } else { + // mask original input gradients since this path masks the aggregated + // gradients only + MaskGradientNonMasters(input_gradient); + // in dropout case, ptemp1 (contained in input to use) still contains the + // dropout matrix so no need to recompute + } + + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_to_use.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); + WeightGradientSyncSum2(); + if (layer_number_ != 0) { // deal with feature gradients for the self feature here // this function will sum directly into the backward matrix diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 35f95b64a6..317811b6df 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -14,7 +14,7 @@ galois::SigmoidLayer::ForwardPhaseCPU( float_accumulator_.reset(); galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { @@ -71,7 +71,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { backward_output_matrix_.assign(backward_output_matrix_.size(), 0); galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index a268089b33..6ac09806fe 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -16,7 +16,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( #endif galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) @@ -80,7 +80,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { backward_output_matrix_.assign(backward_output_matrix_.size(), 0); galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { if (IsSampledLayer()) { diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index b1170e2d16..b9ef634c53 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -63,9 +63,12 @@ if (NOT GALOIS_ENABLE_GPU) set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() - add_executable(weight-sync-test weight-sync-test.cpp) - target_link_libraries(weight-sync-test galois_gnn) - # TODO multi host tests + add_executable(back-conv-test back-conv-test.cpp) + target_link_libraries(back-conv-test galois_gnn) + foreach(host_count ${hosts}) + add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} back-conv-test) + set_tests_properties(run-back-conv-${host_count} PROPERTIES ENVIRONMENT 
"GALOIS_DO_NOT_BIND_THREADS=1") + endforeach() add_executable(multilabel-read multilabel-read.cpp) target_link_libraries(multilabel-read galois_gnn) diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp new file mode 100644 index 0000000000..b1c9c025c6 --- /dev/null +++ b/libgnn/test/back-conv-test.cpp @@ -0,0 +1,176 @@ +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + galois::PointerWithSize feats = + test_graph.GetLocalFeatures(); + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 0; + break; + case 1: + ground_truth = 1; + break; + case 2: + ground_truth = 2; + break; + case 3: + ground_truth = 3; + break; + case 4: + ground_truth = 4; + break; + case 5: + ground_truth = 5; + break; + case 6: + ground_truth = 6; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_VASSERT(feats[row * 3 + c] == ground_truth, "{} not {}", + ground_truth, feats[row * 2 + c]); + } + } + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph.size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.DebugConfig(); + + // dummy 1 matrix + std::vector dummy_ones_v(test_graph.size() * 2, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + // create layer 1 for testing backward prop actually giving weights back + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + galois::PointerWithSize layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_VASSERT(layer_1_forward_output[row * 2 + c] == ground_truth, + "{} not {}", ground_truth, + layer_1_forward_output[row * 2 + c]); + } + } + + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 2; + break; + case 1: + ground_truth = 4; + break; + case 2: + ground_truth = 4; + break; + case 3: + ground_truth = 4; + break; + case 4: + ground_truth = 4; + break; + case 5: + ground_truth = 4; + break; + case 6: + ground_truth = 2; + break; + 
default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT(layer_1_backward_output[row * 3 + c] == ground_truth); + } + } + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[0] == 36, "36 not {}", + layer_1_weight_gradients[0]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[1] == 36, "36 not {}", + layer_1_weight_gradients[1]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[2] == 36, "36 not {}", + layer_1_weight_gradients[2]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[3] == 36, "36 not {}", + layer_1_weight_gradients[3]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[4] == 36, "36 not {}", + layer_1_weight_gradients[4]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[5] == 36, "36 not {}", + layer_1_weight_gradients[5]); + + layer_1.reset(); + + return 0; +} diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp deleted file mode 100644 index 4c2c01f844..0000000000 --- a/libgnn/test/weight-sync-test.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "galois/Logging.h" -#include "galois/GraphNeuralNetwork.h" -#include "galois/layers/GraphConvolutionalLayer.h" - -int main() { - galois::DistMemSys G; - - if (galois::runtime::getSystemNetworkInterface().Num == 4) { - GALOIS_LOG_ERROR("This test should be run with 4 hosts/processes"); - exit(1); - } - - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - - // create same layer from convlayer-test and make sure result is the same even - // in multi-host environment - galois::GNNLayerDimensions dimension_0; - dimension_0.input_rows = test_graph->size(); - dimension_0.input_columns = 3; - dimension_0.output_columns = 2; - galois::GNNLayerConfig dcon; - - dcon.disable_aggregate_after_update = false; - // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique(0, *(test_graph.get()), - dimension_0, dcon); - layer_0->InitAllWeightsTo1(); - - // backward pass checking; check the gradients out - std::vector dummy_ones_v(test_graph->size() * 2, 1); - galois::PointerWithSize dummy_ones(dummy_ones_v); - layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); - - // gradient verification; average - // host 0 has 18, 1 has 21, 2 has 12, 3 has 0s; averaged to 12.75 - const galois::PointerWithSize& grads = - layer_0->GetLayerWeightGradients(); - for (size_t i = 0; i < 6; i++) { - GALOIS_LOG_ASSERT(grads[i] == 12.75); - } - - // XXX CVC -} From 5ce1c24d09741a9ac27490058d8b51a59b097274 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 17:28:28 -0600 Subject: [PATCH 495/660] Bitset now used for GNN aggregation Adds the use of a bitset for sync for the aggregation in GCN and SAGE. Theoretically this should improve sync time significantly. 
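Illustrative sketch (not part of this patch): the aggregation bitset marks which local rows were actually written during aggregation so that only those rows need to be packed for synchronization. The stand-in below shows the idea in plain C++, assuming equal-length rows; the helper name and types are made up, while the real code uses galois::DynamicBitSet (bitset_graph_aggregate) and the Gluon sync substrate.

    #include <cstddef>
    #include <vector>

    // Pack only the rows flagged as dirty; the returned payload plus the row
    // ids in *packed_rows is what would be shipped to other hosts instead of
    // the whole matrix.
    std::vector<float> PackDirtyRows(const std::vector<float>& matrix,
                                     const std::vector<bool>& dirty,
                                     std::size_t column_length,
                                     std::vector<std::size_t>* packed_rows) {
      std::vector<float> payload;
      for (std::size_t row = 0; row < dirty.size(); ++row) {
        if (!dirty[row]) {
          continue; // row untouched by aggregation: nothing to synchronize
        }
        packed_rows->push_back(row);
        payload.insert(payload.end(),
                       matrix.begin() + row * column_length,
                       matrix.begin() + (row + 1) * column_length);
      }
      return payload;
    }

This only pays off when some proxies are never touched; as the follow-up commit notes, a topology-driven aggregation tends to touch most rows anyway.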
--- .../galois/graphs/GraphAggregationSyncStructures.h | 3 +++ libgnn/include/galois/layers/GraphConvolutionalLayer.h | 2 ++ libgnn/include/galois/layers/SAGELayer.h | 2 ++ libgnn/src/graphs/GNNGraph.cpp | 9 +++++---- libgnn/src/layers/GraphConvolutionalLayer.cpp | 2 ++ libgnn/src/layers/SAGELayer.cpp | 2 ++ 6 files changed, 16 insertions(+), 4 deletions(-) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index e5dcb970af..8e3db38096 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -11,6 +11,7 @@ namespace graphs { extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; +extern galois::DynamicBitSet bitset_graph_aggregate; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -89,5 +90,7 @@ GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_output, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, layer_number_to_sync); #endif + +GALOIS_SYNC_STRUCTURE_BITSET(graph_aggregate); } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 4c884c129f..e44976f73b 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -7,6 +7,8 @@ namespace galois { +extern galois::DynamicBitSet graphs::bitset_graph_aggregate; + class GraphConvolutionalLayer : public GNNLayer { public: //! Initializes the variables of the base class and also allocates additional diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 6825812315..59f71b9041 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -8,6 +8,8 @@ namespace galois { +extern galois::DynamicBitSet graphs::bitset_graph_aggregate; + struct SAGELayerConfig { bool disable_concat{false}; }; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4efae3c429..481784dc82 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -38,6 +38,7 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +galois::DynamicBitSet bitset_graph_aggregate; #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; unsigned layer_number_to_sync; @@ -79,6 +80,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, *partitioned_graph_, host_id_, galois::runtime::getSystemNetworkInterface().Num, false, partitioned_graph_->cartesianGrid()); + bitset_graph_aggregate.resize(partitioned_graph_->size()); // read in entire graph topology ReadWholeGraph(dataset_name); @@ -165,10 +167,9 @@ void galois::graphs::GNNGraph::AggregateSync( // set globals for the sync substrate gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - - // TODO(loc) bitset setting - sync_substrate_->sync( - "GraphAggregateSync"); + sync_substrate_ + ->sync( + "GraphAggregateSync"); } #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 8d3c6754a2..d2ee5ddcb0 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -273,6 +273,8 @@ void 
galois::GraphConvolutionalLayer::AggregateAllCPU( } } + graphs::bitset_graph_aggregate.set(src); + GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.NormFactor(src); diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 194563610d..697722a9c5 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -383,6 +383,8 @@ void galois::SAGELayer::AggregateAllCPU( } } + graphs::bitset_graph_aggregate.set(src); + GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.DegreeNorm(src); From 1bf25f4e3a12716265c76e5b56121fe8c21f1f0b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 17:50:00 -0600 Subject: [PATCH 496/660] Slightly more intelligent bitset for GNN Was setting bitset even if source had no edges: changed it to be slightly more precise. Unlikely that bitset helps at all though because of topology driven nature of the aggregation operator. The hope was that mirror nodes without edges would not get sync'd, but since sync call is already writeSource those shouldn't be getting sync'd anyways. --- libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 ++- libgnn/src/layers/SAGELayer.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index d2ee5ddcb0..79c074ff2a 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -273,7 +273,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } } - graphs::bitset_graph_aggregate.set(src); GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { @@ -282,6 +281,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // init to self if (!config_.disable_self_aggregate) { + graphs::bitset_graph_aggregate.set(src); // only aggregate self once on master if (src < last_master) { for (size_t i = 0; i < column_length; i++) { @@ -295,6 +295,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { size_t dst = graph_.EdgeDestination(e); + graphs::bitset_graph_aggregate.set(src); if (layer_phase_ == GNNPhase::kTrain) { if (IsInductiveLayer()) { diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 697722a9c5..3ba096182f 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -383,7 +383,6 @@ void galois::SAGELayer::AggregateAllCPU( } } - graphs::bitset_graph_aggregate.set(src); GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { @@ -392,6 +391,7 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.EdgeDestination(e); if (layer_phase_ == GNNPhase::kTrain) { From 011fc14a35524ac6442665281ad4919d37190ed5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 21:01:59 -0600 Subject: [PATCH 497/660] Softmax-cross-entropy derivative merged Merged the application of softmax and cross entropy derivatives into a very simple subtraction term that is apparently correct according to math gurus on the internet. 
Advantages: 1) much simpler 2) most importantly, numerical stability + correctness: previous softmax/cross-entropy derivative was causing blowup of accuracy after a few epochs where Xuhao OMP code was not. This simplifies the derivative into something that is numerically stable (and probably more correct than what was there before). --- libgnn/include/galois/GNNMath.h | 35 +++++++----- libgnn/src/GNNMath.cpp | 5 ++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 1 - libgnn/src/layers/SAGELayer.cpp | 1 - libgnn/src/layers/SoftmaxLayer.cpp | 55 ++++++++----------- 5 files changed, 49 insertions(+), 48 deletions(-) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index e32d062cc5..9e50295200 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -29,6 +29,7 @@ void GNNSoftmaxDerivative(const size_t vector_length, const GNNFloat* prev_output, const GNNFloat* prev_output_derivative, GNNFloat* temp_vector, GNNFloat* output); + //! Performs cross entropy given a ground truth and input and returns the loss //! value. template @@ -36,12 +37,12 @@ galois::GNNFloat GNNCrossEntropy(const size_t vector_length, const TruthType* ground_truth, const GNNFloat* input) { GNNFloat loss = 0.0; - // Note that this function works if there are multiple non-zeros in the // ground truth vector // If there is only 1 then this function is overkill and it should break - // early (i.e. single class) - // Multiclass = fine + // early (i.e. single class); in one-hot vector setting for instance + // Multiclass = fine: in fact this is meant for multiclass but also + // works for single for (size_t i = 0; i < vector_length; i++) { if (ground_truth[i] == 0.0) { if (input[i] == 1.0) { @@ -60,24 +61,28 @@ galois::GNNFloat GNNCrossEntropy(const size_t vector_length, return loss; } - //! Derivative of cross entropy; gradients saved into an output vector. 
template void GNNCrossEntropyDerivative(const size_t vector_length, const TruthType* ground_truth, const GNNFloat* input, GNNFloat* gradients) { + // TODO(loc) delete this function once I fully understand why it wasn't + // working + GALOIS_LOG_FATAL( + "DO NOT USE THIS FUNCTION; NOT CORRECT IN ALL CIRCUMSTANCES"); for (size_t i = 0; i < vector_length; i++) { - // TODO(loc) assumption: binary classifier, make explicit in function name - if (ground_truth[i]) { - gradients[i] = -1.0 / (input[i] + static_cast(1e-10)); - } else { - if (input[i] == 1.0) { - // opposite - gradients[i] = 1.0 / static_cast(1e-10); - } else { - gradients[i] = 1.0 / (1.0 - input[i]); - } - } + gradients[i] = -ground_truth[i] / (input[i] + static_cast(1e-10)); + // if (ground_truth[i]) { + // gradients[i] = -1.0 / (input[i] + static_cast(1e-10)); + //} + // else { + // if (input[i] == 1.0) { + // // opposite + // gradients[i] = 1.0 / static_cast(1e-10); + // } else { + // gradients[i] = 1.0 / (1.0 - input[i]); + // } + //} } } diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index fe14198d83..aef3dae6dd 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -98,6 +98,11 @@ void galois::GNNSoftmaxDerivative(const size_t vector_length, const GNNFloat* prev_output, const GNNFloat* prev_output_derivative, GNNFloat* temp_vector, GNNFloat* output) { + // TODO(loc) remove this function, unnecessary as cross/softmax derivatives + // can be merged as currently done in Softmax code + // will do so in a separate commit + GALOIS_LOG_FATAL("Should not need this function anymore with simplified " + "combined derivatives in each layer"); for (size_t i = 0; i < vector_length; i++) { for (size_t j = 0; j < vector_length; j++) { temp_vector[j] = (j == i) ? prev_output[i] * (1.0 - prev_output[i]) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 79c074ff2a..a60b1eb0c4 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -273,7 +273,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } } - GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.NormFactor(src); diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 3ba096182f..191c02d00e 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -383,7 +383,6 @@ void galois::SAGELayer::AggregateAllCPU( } } - GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.DegreeNorm(src); diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 6ac09806fe..10ed93c8ff 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -9,6 +9,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; #ifndef NDEBUG + //#ifdef NDEBUG galois::DGAccumulator loss_accum; galois::DGAccumulator handled; loss_accum.reset(); @@ -41,6 +42,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( GNNCrossEntropy(feature_length, ground_truth_vec->data(), &forward_output_matrix_[feature_length * i]); #ifndef NDEBUG + //#ifdef NDEBUG loss_accum += input_loss_[i]; handled += 1; #endif @@ -50,6 +52,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); #ifndef NDEBUG + //#ifdef NDEBUG 
GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); galois::gPrint("Loss is ", reduced_loss / t, "\n"); @@ -81,43 +84,33 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::do_all( galois::iterate(graph_.begin(), graph_.end()), - [&](const unsigned i) { - if (graph_.IsValidForPhase(i, layer_phase_)) { + [&](const unsigned node) { + if (graph_.IsValidForPhase(node, layer_phase_)) { if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(node)) return; } - // create ground truth vector for this LID - // TODO maybe make this part of the graph class instead of recreating - // every time - std::vector* ground_truth_vec = - ground_truth_vectors_.getLocal(); - assert(ground_truth_vec->size() == feature_length); - ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - // single class label is an index; set the correct one - (*ground_truth_vec)[static_cast( - graph_.GetSingleClassLabel(i))] = 1.0; - - // derivative cross entropy into norm grad - std::vector* norm_gradient = - norm_gradient_vectors_.getLocal(); - GNNCrossEntropyDerivative( - feature_length, ground_truth_vec->data(), - &(forward_output_matrix_[i * feature_length]), - norm_gradient->data()); - - // use norm grad with softmax deritave, save and return - std::vector* softmax_temp = - softmax_temp_vectors_.getLocal(); - GNNSoftmaxDerivative(feature_length, - &(forward_output_matrix_[i * feature_length]), - norm_gradient->data(), softmax_temp->data(), - &(backward_output_matrix_[i * feature_length])); + size_t correct = graph_.GetSingleClassLabel(node); + // See here for explanation for why this works + // https://gombru.github.io/2018/05/23/cross_entropy_loss/ + // Derivation of full combined derivative isn't there, but some + // emperical inspection tells me this is likely correct + // TODO(loc) work it out myself + for (size_t idx = 0; idx < feature_length; idx++) { + if (idx == correct) { + // positive class + backward_output_matrix_[node * feature_length + idx] = + forward_output_matrix_[node * feature_length + idx] - 1; + } else { + // negative class + backward_output_matrix_[node * feature_length + idx] = + forward_output_matrix_[node * feature_length + idx]; + } + } } }, - // TODO chunk size? - // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxBackward")); return PointerWithSize(backward_output_matrix_); From 8d80e9e7e03bb5d17b17ee4d5d7e0057193c5f3f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 8 Mar 2021 21:49:36 -0600 Subject: [PATCH 498/660] better serialization for vec of vectors in gluon Vector of vector serialization in Gluon was bad. This commit fixes it by serializing all vectors into a single one before sendoff so the serialization unit doesn't have to deal with it. Because this commit was made in a rush (so I can schedule tests) I did not really bother cleaning it much.
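Illustrative sketch (not part of this patch): the fix flattens the vector of per-node float vectors into one contiguous array before serialization, in the spirit of the GluonSerializeManyVecToOne loop in the diff below. A minimal stand-in, assuming every inner vector has the same length; the function name is made up for illustration.

    #include <cstddef>
    #include <cstring>
    #include <vector>

    std::vector<float>
    FlattenForSend(const std::vector<std::vector<float>>& val_vec) {
      if (val_vec.empty()) {
        return {};
      }
      const std::size_t feature_size = val_vec.front().size();
      std::vector<float> single_array(val_vec.size() * feature_size);
      for (std::size_t node = 0; node < val_vec.size(); ++node) {
        // copy one node's feature vector into its slot of the flat buffer
        std::memcpy(single_array.data() + node * feature_size,
                    val_vec[node].data(), feature_size * sizeof(float));
      }
      return single_array;
    }

On the receiving side the per-node length (the feature_size also serialized in the patch) is what lets the flat buffer be split back into per-node rows.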
--- libgluon/CMakeLists.txt | 2 +- .../include/galois/graphs/GluonSubstrate.h | 318 +++++++++++++++++- 2 files changed, 313 insertions(+), 7 deletions(-) diff --git a/libgluon/CMakeLists.txt b/libgluon/CMakeLists.txt index 3c9812e498..543e796156 100644 --- a/libgluon/CMakeLists.txt +++ b/libgluon/CMakeLists.txt @@ -18,7 +18,7 @@ target_include_directories(galois_gluon PUBLIC ) if (GALOIS_COMM_STATS) - target_compile_definitions(galois_gluon PRIVATE GALOIS_COMM_STATS=1) + target_compile_definitions(galois_gluon PUBLIC GALOIS_COMM_STATS=1) endif() if (GALOIS_USE_BARE_MPI) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 6f9c9f5b85..2ef2e0b136 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -710,6 +710,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::no_stats()); } + template + struct is_vector_of_vec : public std::false_type {}; + template + struct is_vector_of_vec, A>> + : public std::true_type {}; + //////////////////////////////////////////////////////////////////////////////// // Message prep functions (buffering, send buffer getting, etc.) //////////////////////////////////////////////////////////////////////////////// @@ -735,11 +741,21 @@ class GluonSubstrate : public galois::runtime::GlobalObject { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; if (BitsetFnTy::is_valid()) { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); + if (is_vector_of_vec::value) { + syncExtractFloatVecHack( + loopName, x, sharedNodes[x], b, elem_size); + } else { + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); + } } else { - syncExtract(loopName, x, sharedNodes[x], - b, elem_size); + if (is_vector_of_vec::value) { + // TODO (loc) + GALOIS_LOG_FATAL("implement me"); + } else { + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); + } } std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; @@ -1876,10 +1892,164 @@ class GluonSubstrate : public galois::runtime::GlobalObject { extractSubset( loopName, indices, bit_set_count, offsets, val_vec); } + serializeMessage(loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, val_vec, b); } else { + // TODO(loc/hochan) vector gpu hack for gnns + if (data_mode == noData) { + b.resize(0); + if (!async) { + gSerialize(b, data_mode); + } + } else if (data_mode == gidsData) { + b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + sizeof(size_t) + + (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + } else if (data_mode == offsetsData) { + b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + sizeof(size_t) + + (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + } else if (data_mode == bitsetData) { + size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t); + b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + sizeof(size_t) // bitset size + + sizeof(size_t) // bitset vector size + + bitset_alloc_size + sizeof(size_t) + + (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + } else { // onlyData + b.resize(sizeof(DataCommMode) + sizeof(size_t) + + (num * sizeof(typename SyncFnTy::ValTy))); + } + } + + reportRedundantSize(loopName, syncTypeStr, num, bit_set_count, + bit_set_comm); + } else { + data_mode = noData; + b.resize(0); + if (!async) { + gSerialize(b, noData); + } + } + + Textract.stop(); + + std::string metadata_str(syncTypeStr + "MetadataMode_" + + std::to_string(data_mode) + "_" + + get_run_identifier(loopName)); + galois::runtime::reportStatCond_Single(RNAME, metadata_str, + 1); + } + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + void syncExtractFloatVecHack(std::string, unsigned, std::vector&, + galois::runtime::SendBuffer&, size_t) { + // TODO(loc) cleaner way to do this + GALOIS_LOG_FATAL( + "Execution should not call float vec hack if not vector of vectors"); + } + + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + void syncExtractFloatVecHack(std::string loopName, unsigned from_id, + std::vector& indices, + galois::runtime::SendBuffer& b, + size_t elem_size) { + // TODO(loc) assumption that type in the VecTy is a vector of floats + // throughout this code; more robust solution would detect it other ways + uint32_t num = indices.size() * elem_size; + galois::DynamicBitSet& bit_set_comm = syncBitset; + static VecTy val_vec; // sometimes wasteful + static galois::gstl::Vector single_array; + galois::PODResizeableArray& offsets = syncOffsets; + + //////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string extract_timer_str(syncTypeStr + "Extract_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textract(extract_timer_str.c_str(), + RNAME); + std::string extract_alloc_timer_str(syncTypeStr + "ExtractAlloc_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractalloc( + extract_alloc_timer_str.c_str(), RNAME); + std::string extract_batch_timer_str(syncTypeStr + "ExtractBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractbatch( + extract_batch_timer_str.c_str(), RNAME); + //////////////////////////////////////////////////////////////////////////// + + DataCommMode data_mode; + Textract.start(); + + if (num > 0) { + size_t bit_set_count = 0; + Textractalloc.start(); + b.reserve(getMaxSendBufferSize(num)); + Textractalloc.stop(); + + Textractbatch.start(); + bool batch_succeeded = extractBatchWrapper( + from_id, b, bit_set_count, data_mode); + Textractbatch.stop(); + + // GPUs have a batch function they can use; CPUs do not; therefore, + // CPUS always enter this if block + if (!batch_succeeded) { + Textractalloc.start(); + b.resize(0); + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + val_vec.reserve(maxSharedSize); + bit_set_comm.resize(num); + offsets.resize(num); + val_vec.resize(num); + Textractalloc.stop(); + const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + getBitsetAndOffsets( + loopName, indices, bit_set_compute, bit_set_comm, offsets, + bit_set_count, data_mode); + + if (data_mode == onlyData) { + bit_set_count = indices.size(); + extractSubset( + loopName, indices, bit_set_count, offsets, val_vec); + } else if (data_mode != + noData) { // bitsetData or offsetsData or gidsData + extractSubset( + loopName, indices, bit_set_count, offsets, val_vec); + } + + // Vector of vectors is in val_vec + // val vec over to contiguous array of #s + size_t num_nodes = val_vec.size(); + size_t feature_size = val_vec[0].size(); + single_array.resize(num_nodes * feature_size); + galois::do_all( + galois::iterate(size_t{0}, num_nodes), + [&](size_t node) { + std::memcpy(&(single_array.data()[node * feature_size]), + val_vec[node].data(), feature_size * sizeof(float)); + }, + galois::loopname("GluonSerializeManyVecToOne")); + + serializeMessage(loopName, data_mode, bit_set_count, + indices, offsets, bit_set_comm, + single_array, b); + gSerialize(b, feature_size); + } else { + // TODO(loc/hochan) vector gpu hack for gnns if (data_mode == noData) { b.resize(0); if (!async) { @@ -2259,6 +2429,121 @@ class GluonSubstrate : public galois::runtime::GlobalObject { deserializeMessage(loopName, data_mode, num, buf, bit_set_count, offsets, bit_set_comm, buf_start, retval, val_vec); + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + val_vec.reserve(maxSharedSize); + + galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + if (data_mode == bitsetData) { + size_t bit_set_count2; + getOffsetsFromBitset(loopName, bit_set_comm, offsets, + bit_set_count2); + assert(bit_set_count == bit_set_count2); + } + + if (data_mode == onlyData) { + setSubset(loopName, sharedNodes[from_id], + bit_set_count, offsets, val_vec, + bit_set_compute); + } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { + setSubset(loopName, sharedNodes[from_id], + bit_set_count, offsets, val_vec, + bit_set_compute, buf_start); + } else if (data_mode == gidsData) { + setSubset(loopName, offsets, bit_set_count, offsets, val_vec, + bit_set_compute); + } else { // bitsetData or 
offsetsData + setSubset(loopName, sharedNodes[from_id], + bit_set_count, offsets, val_vec, + bit_set_compute); + } + // TODO: reduce could update the bitset, so it needs to be copied + // back to the device + } + } + } + + Tset.stop(); + + return retval; + } + + // TODO (loc) way too much code duplication + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + size_t syncRecvApplyVecHack(uint32_t from_id, + galois::runtime::RecvBuffer& buf, + std::string loopName) { + //////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string set_timer_str(syncTypeStr + "Set_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tset(set_timer_str.c_str(), RNAME); + std::string set_batch_timer_str(syncTypeStr + "SetBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tsetbatch( + set_batch_timer_str.c_str(), RNAME); + //////////////////////////////////////////////////////////////////////////// + + galois::DynamicBitSet& bit_set_comm = syncBitset; + static VecTy val_vec; + // TODO(loc) assumes float for now + static galois::gstl::Vector single_array; + galois::PODResizeableArray& offsets = syncOffsets; + + auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; + + Tset.start(); + + if (num > 0) { // only enter if we expect message from that host + DataCommMode data_mode; + // 1st deserialize gets data mode + galois::runtime::gDeserialize(buf, data_mode); + + if (data_mode != noData) { + // GPU update call + Tsetbatch.start(); + bool batch_succeeded = + setBatchWrapper(from_id, buf, data_mode); + Tsetbatch.stop(); + + // cpu always enters this block + if (!batch_succeeded) { + size_t bit_set_count = num; + size_t buf_start = 0; + + // deserialize the rest of the data in the buffer depending on the + // data mode; arguments passed in here are mostly output vars + deserializeMessage(loopName, data_mode, num, buf, + bit_set_count, offsets, bit_set_comm, + buf_start, retval, single_array); + + // deserialize sngle array into vector of vector state again + size_t feature_size; + gDeserialize(buf, feature_size); + size_t num_nodes = single_array.size() / feature_size; + + assert(single_array.size() % feature_size == 0); + val_vec.resize(num_nodes); + galois::do_all( + galois::iterate(size_t{0}, num_nodes), + [&](size_t node) { + val_vec[node].resize(feature_size); + std::memcpy((void*)(val_vec[node].data()), + &(single_array[node * feature_size]), + feature_size * sizeof(float)); + }, + galois::loopname("GluonDeserializeBackToVecOfVec")); bit_set_comm.reserve(maxSharedSize); offsets.reserve(maxSharedSize); @@ -2304,6 +2589,17 @@ class GluonSubstrate : public galois::runtime::GlobalObject { return retval; } + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + size_t syncRecvApplyVecHack(uint32_t, galois::runtime::RecvBuffer&, + std::string) { + GALOIS_LOG_FATAL("NOT SUPPORTED, should never get called"); + return 0; + } + /** * VECTOR BITSET VARIANT. 
* @@ -2498,6 +2794,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { RNAME); if (async) { + if (is_vector_of_vec::value) { + galois::gWarn("Async execution does not support the vector of vec hack " + "(most important for GNN)"); + } + size_t syncTypePhase = 0; if (syncType == syncBroadcast) syncTypePhase = 1; @@ -2526,8 +2827,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } while (!p); Twait.stop(); - syncRecvApply( - p->first, p->second, loopName); + if (is_vector_of_vec::value) { + syncRecvApplyVecHack( + p->first, p->second, loopName); + } else { + syncRecvApply( + p->first, p->second, loopName); + } } incrementEvilPhase(); } From 6c6947d245d5ac313973a53954077261393bdd87 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 9 Mar 2021 15:48:58 -0600 Subject: [PATCH 499/660] Size usage, weight sync sub removal, fixes Prints for size used on layers. Removal of sync substrate for weights since collectives are used now. Some bug fixes for mask policy in graph reading + size used + print of set sizes. Do not allocate weights for layer 0 backward. Fixes tests that check for it as well. --- libgnn/CMakeLists.txt | 1 - libgnn/include/galois/layers/GNNLayer.h | 15 ++-- .../galois/layers/GluonGradientInterface.h | 81 ------------------- libgnn/include/galois/layers/SAGELayer.h | 6 -- libgnn/src/graphs/GNNGraph.cpp | 47 +++++++---- libgnn/src/layers/GNNLayer.cpp | 65 ++++++--------- libgnn/src/layers/GluonGradientInterface.cpp | 49 ----------- libgnn/src/layers/GraphConvolutionalLayer.cpp | 10 ++- libgnn/src/layers/SAGELayer.cpp | 25 +++--- libgnn/test/aggregate-sync-test.cpp | 13 ++- libgnn/test/convlayer-test.cpp | 23 +----- libgnn/test/sage-layer-test.cpp | 23 +----- 12 files changed, 95 insertions(+), 263 deletions(-) delete mode 100644 libgnn/include/galois/layers/GluonGradientInterface.h delete mode 100644 libgnn/src/layers/GluonGradientInterface.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index b59cccef93..ed60ae032b 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -5,7 +5,6 @@ set(sources src/graphs/GNNGraph.cpp src/layers/DenseLayer.cpp src/layers/GNNLayer.cpp - src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp src/layers/L2NormLayer.cpp src/layers/SAGELayer.cpp diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 27fd1ac0c7..47b38a7f73 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -3,7 +3,6 @@ #include "galois/PerThreadRNG.h" #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" -#include "galois/layers/GluonGradientInterface.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/GNNLayer.cuh" @@ -225,15 +224,6 @@ class GNNLayer { PointerWithSize p_forward_output_matrix_; PointerWithSize p_backward_output_matrix_; - //! Synchronizes all weights (used in distributed setting) - void SyncInitialWeights(); - - //! Wrapper over gradient matrix to make it compatible with Gluon - std::unique_ptr gradient_sync_interface_; - //! Synchronization substrate for the weight gradients - std::unique_ptr> - gradient_sync_substrate_; - //! RNG for matrix initialization PerThreadRNG random_init_rng_{-5.0, 5.0}; //! RNG for dropout @@ -303,6 +293,11 @@ class GNNLayer { #endif void MaskGradientNonMasters(PointerWithSize* gradients); + + //! 
Does some math to get GB used by some # of floats + double FloatElementsToGB(size_t num_of_floats) const { + return num_of_floats * double{4} / (1 << 30); + } }; } // namespace galois diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h deleted file mode 100644 index a41ca0cb4d..0000000000 --- a/libgnn/include/galois/layers/GluonGradientInterface.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#include "galois/GNNTypes.h" -#include "galois/gstl.h" -#include "galois/runtime/Network.h" - -namespace galois { - -// TODO figure out which function calls can be removed without causing compiler -// to complain - -//! Wraps a matrix and allows it to be synchronized via Gluon as it provides -//! all the functions Gluon needs. -//! Assumes the matrix is initialized the same way across all hosts (if not -//! they'll all see the same values after the first round of sync anyways) -class GluonGradientInterface { -public: - // typedefs required by GPU end to build; not actually used anywhere in this - // class (...at the moment) - // as such, dummy declarations that don't particularly make sense - // TODO will likely need to revisit once GPU substrate for this needs to be - // setup - using GraphNode = uint32_t; - using edge_iterator = boost::counting_iterator; - using EdgeType = char; - - //! Save reference to weight gradients. - //! Then setup mirror metadata for Gluon to use during setup. - GluonGradientInterface(std::vector& gradients); - - //! Size is number of weights since all hosts own everything - size_t size() const { return num_weights_; } - //! Global size is number of weights - size_t globalSize() const { return num_weights_; } - //! Return the weights owned by this host - size_t numMasters() const { return num_owned_; } - //! GID is same as LID since all hosts have all weights - uint32_t getGID(const uint32_t node_id) const { return node_id; } - //! LID is same as GID since all hosts have all weights - uint32_t getLID(const uint32_t node_id) const { return node_id; } - //! Return weight w - GNNFloat& getData(uint32_t w) const { return gradients_[w]; } - //! Return ranges for mirrors (unowned nodes) - const std::vector>& getMirrorRanges() const { - return mirror_ranges_; - } - //! Return mirror nodes for each host from this host's point of view - std::vector>& getMirrorNodes() { return mirror_nodes_; } - - ////////////////////////////////////////////////////////////////////////////// - - // for all that follow, no edges in this sync so most of this returns what - // you expect - // size_t getNumNodesWithEdges() const { return 0; } - bool is_vertex_cut() const { return false; } - unsigned edge_begin(uint32_t) const { return 0; } - unsigned edge_end(uint32_t) const { return 0; } - unsigned getEdgeDst(uint32_t) const { return 0; } - unsigned getEdgeData(uint32_t) const { return 0; } - void deallocate() const {}; - - bool is_a_graph() const { return false; } - -private: - //! Reference to gradients that can get synchronized - std::vector& gradients_; - //! number of weight gradients - size_t num_weights_; - //! number of single gradients this host is responsible for - size_t num_owned_; - //! First weight that's a master - size_t begin_master_; - //! Last weight that's a master - size_t end_master_; - //! My nodes whose's masters are on other hosts; global ids - std::vector> mirror_nodes_; - //! 
nodes that are mirrors on this host - std::vector> mirror_ranges_; -}; - -} // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 59f71b9041..9dc7007931 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -109,12 +109,6 @@ class SAGELayer : public GNNLayer { PointerWithSize p_layer_weights_2_; PointerWithSize p_layer_weight_gradients_2_; - //! Wrapper over 2nd gradient matrix to make it compatible with Gluon - std::unique_ptr gradient_sync_interface_2_; - //! Synchronization substrate for the 2nd weight gradients - std::unique_ptr> - gradient_sync_substrate_2_; - // 2 temporaries the size of the forward input; used for dropout and // aggregation (if either are required) std::vector in_temp_1_; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 481784dc82..a75bb47498 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -362,6 +362,9 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, file_stream.close(); + galois::gInfo(host_prefix_, "Read ", found_local_vertices, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); } @@ -414,6 +417,10 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( } } full_feature_set.reset(); + + galois::gInfo( + host_prefix_, "Read ", local_node_features_.size(), " features (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), " GB)"); GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } @@ -438,19 +445,23 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( mask_range->end = range_end; mask_range->size = range_end - range_begin; - size_t cur_line_num = 0; + size_t cur_line_num = 0; + // valid nodes on this host size_t local_sample_count = 0; + // this tracks TOTAL # of valid nodes in this group (not necessarily valid + // ones on this host) + size_t valid_count = 0; std::string line; // each line is a number signifying if mask is set for the vertex while (std::getline(mask_stream, line)) { std::istringstream mask_stream(line); // only examine vertices/lines in range if (cur_line_num >= range_begin && cur_line_num < range_end) { - // only bother if node is local - if (partitioned_graph_->isLocal(cur_line_num)) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + valid_count++; + if (partitioned_graph_->isLocal(cur_line_num)) { masks[partitioned_graph_->getLID(cur_line_num)] = 1; local_sample_count++; } @@ -460,7 +471,7 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( } mask_stream.close(); - if (local_sample_count != mask_range->size) { + if (valid_count != mask_range->size) { // overlapping masks: need to actually check the masks rather than use // ranges if (!incomplete_masks_) { @@ -470,7 +481,7 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( incomplete_masks_ = true; } - return local_sample_count; + return valid_count; } void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { @@ -513,12 +524,20 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } else { // XXX i can get local sample counts from here if i need it - ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, - local_training_mask_.data()); - ReadLocalMasksFromFile(dataset_name, 
"val", &global_validation_mask_range_, - local_validation_mask_.data()); - ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, - local_testing_mask_.data()); + size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", + &global_training_mask_range_, + local_training_mask_.data()); + size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", + &global_validation_mask_range_, + local_validation_mask_.data()); + size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", + &global_testing_mask_range_, + local_testing_mask_.data()); + if (galois::runtime::getSystemNetworkInterface().ID == 0) { + galois::gInfo("Valid # training nodes is ", valid_train); + galois::gInfo("Valid # validation nodes is ", valid_val); + galois::gInfo("Valid # test nodes is ", valid_test); + } } } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 1084bf9010..cde5698a93 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,14 +9,22 @@ galois::GNNLayer::GNNLayer(size_t layer_num, : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { - // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input - dropout_mask_.resize( - layer_dimensions_.input_rows * layer_dimensions_.input_columns, false); + if (!config_.disable_dropout) { + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns, + false); + } // allocate memory based on layer dimensions size_t num_weight_elements = layer_dimensions_.input_columns * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer weights ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weights_.resize(num_weight_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer gradients ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weight_gradients_.resize(num_weight_elements, 0); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -27,22 +35,25 @@ galois::GNNLayer::GNNLayer(size_t layer_num, #endif GlorotBengioInit(&layer_weights_); - - // initialize sync substrate - gradient_sync_interface_ = - std::make_unique(layer_weight_gradients_); - gradient_sync_substrate_ = std::make_unique< - galois::graphs::GluonSubstrate>( - *gradient_sync_interface_, - galois::runtime::getSystemNetworkInterface().ID, - galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", forward output matrix ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); forward_output_matrix_.resize(num_output_elements, 0); - backward_output_matrix_.resize( - layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); + if (layer_number_ != 0) { + galois::gInfo( + graph_.host_prefix(), "Creating layer ", layer_number_, + ", backward output matrix ", + layer_dimensions_.input_rows * layer_dimensions_.input_columns, " (", + FloatElementsToGB(layer_dimensions_.input_rows * + layer_dimensions_.input_columns), + " GB)"); + backward_output_matrix_.resize( + layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); + } #ifdef GALOIS_ENABLE_GPU if (device_personality == 
DevicePersonality::GPU_CUDA) { base_gpu_object_.InitInOutMemory(num_output_elements, @@ -277,10 +288,6 @@ void galois::GNNLayer::WeightGradientSyncSum() { // TODO(hochan) collectives here rather than gluon sync if possible like the // CPU code // preferably without needing to do a gpu->cpu copy - galois::gWarn( - "GPU still using inefficient point to point comms for weight sync"); - gradient_sync_substrate_->sync( - "WeightGradientsSync"); #else // TODO(loc) remove this limitation later; can just do a loop over the weight // matrix @@ -297,28 +304,6 @@ void galois::GNNLayer::WeightGradientSyncSum() { t.stop(); } -void galois::GNNLayer::SyncInitialWeights() { - if (galois::runtime::getSystemNetworkInterface().Num == 1) { - return; - } -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan); not required at the moment however - GALOIS_LOG_FATAL("Need to implement GPU version of this"); -#endif - // copy weights over to gradients - for (size_t i = 0; i < layer_weights_.size(); i++) { - layer_weight_gradients_[i] = layer_weights_[i]; - } - // sync "gradients" with a set only (reduction ignored) - gradient_sync_substrate_->sync( - "InitialSync"); - // copy "gradients" (actually weights) back to weight matrix - for (size_t i = 0; i < layer_weights_.size(); i++) { - layer_weights_[i] = layer_weight_gradients_[i]; - layer_weight_gradients_[i] = 0; - } -} - void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient) { #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/GluonGradientInterface.cpp b/libgnn/src/layers/GluonGradientInterface.cpp deleted file mode 100644 index 74e612af17..0000000000 --- a/libgnn/src/layers/GluonGradientInterface.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include "galois/layers/GluonGradientInterface.h" - -galois::GluonGradientInterface::GluonGradientInterface( - std::vector& gradients) - : gradients_(gradients), num_weights_(gradients_.size()) { - size_t my_host = galois::runtime::getSystemNetworkInterface().ID; - size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; - - // allocate a vector for each host - mirror_nodes_.resize(num_hosts); - - // loop through distribution of weights to hosts - for (unsigned h = 0; h < num_hosts; h++) { - std::pair cur_range = - galois::block_range((size_t)0, num_weights_, h, num_hosts); - - if (h != my_host) { - // setup mirrors for the host h which is just the list of IDs - size_t current_weight = cur_range.first; - size_t last_weight = cur_range.second; - size_t num_host_weights = last_weight - current_weight; - - // set mirrors for host h - mirror_nodes_[h].reserve(num_host_weights); - for (; current_weight < last_weight; current_weight++) { - mirror_nodes_[h].push_back(current_weight); - } - } else { - // these belong to this host; save, then mirror ranges can be - // calculated from this - begin_master_ = cur_range.first; - end_master_ = cur_range.second; - num_owned_ = end_master_ - begin_master_; - - // first range is 0 to begin master - if (begin_master_ > 0) { - mirror_ranges_.emplace_back(0, begin_master_); - } - - // second range is endMaster to end - if (end_master_ < num_weights_) { - mirror_ranges_.emplace_back(end_master_, num_weights_); - } - } - } - - galois::gDebug("[", my_host, "] Weight gradients: this host owns ", - begin_master_, " to ", end_master_); -} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index a60b1eb0c4..44d2df6529 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp 
@@ -10,15 +10,23 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_1_.resize(num_input_elements, 0); if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN input temp var 2 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_2_.resize(num_input_elements, 0); } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 191c02d00e..9f80000bdd 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -15,7 +15,13 @@ galois::SAGELayer::SAGELayer(size_t layer_num, // abstractly it's one matrix: W = W1 | W2 size_t num_weight_elements = layer_dimensions_.input_columns * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE second layer weights ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weights_2_.resize(num_weight_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE second layer gradients ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weight_gradients_2_.resize(num_weight_elements, 0); // reinit both weight matrices as one unit @@ -28,29 +34,28 @@ galois::SAGELayer::SAGELayer(size_t layer_num, // initialize the optimizer std::vector weight_size = {num_weight_elements}; second_weight_optimizer_ = std::make_unique(weight_size, 1); - - // initialize sync substrate for second set - gradient_sync_interface_2_ = - std::make_unique(layer_weight_gradients_2_); - gradient_sync_substrate_2_ = std::make_unique< - galois::graphs::GluonSubstrate>( - *gradient_sync_interface_2_, - galois::runtime::getSystemNetworkInterface().ID, - galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_1_.resize(num_input_elements, 0); // only need to allocate if input <= output because not used otherwise if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE input temp var 2 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_2_.resize(num_input_elements, 0); } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - 
GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 7025331029..c3e3439a5e 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -96,7 +96,9 @@ int main() { // size 2 columns for (size_t c = 0; c < 2; c++) { - GALOIS_LOG_ASSERT(layer_0_forward_output[row * 2 + c] == ground_truth); + GALOIS_LOG_VASSERT(layer_0_forward_output[row * 2 + c] == ground_truth, + "should be {} not {}", ground_truth, + layer_0_forward_output[row * 2 + c]); } } @@ -111,13 +113,10 @@ int main() { layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 backward output; all 0 because layer 0 + // sanity check layer 0 backward output: empty ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == test_graph->size() * 3); - for (size_t i = 0; i < layer_0_backward_output.size(); i++) { - GALOIS_LOG_ASSERT((layer_0_backward_output)[i] == 0); - } + + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 0); ////////////////////////////////////////////////////////////////////////////// // layer 1 to check backward output diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index bcada6c4ed..309433845b 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -103,28 +103,7 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 0); galois::PointerWithSize layer_0_weight_gradients = layer_0->GetLayerWeightGradients(); diff --git a/libgnn/test/sage-layer-test.cpp 
b/libgnn/test/sage-layer-test.cpp index dadc8b0096..39a2cd2635 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -78,28 +78,7 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 0); galois::PointerWithSize layer_0_weight_gradients = layer_0->GetLayerWeightGradients(); From 3c15b652ee6e3d87bf6008a5f07d27fc45f458e4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 9 Mar 2021 16:16:18 -0600 Subject: [PATCH 500/660] 2 minor fixes: paper100M split, features size Made training split for papers100M the entire graph (mask is non-contiguous and occupies most of the graph anyways). It did not start at 0 before which was also problematic on its own. Corrected feature GB print in GNNGraph. 
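The feature GB print fix below matters because the two vectors it could read from differ in length by a factor of the feature dimension: the label array stores one value per local node, while the feature matrix stores feature_length values per node, so the old print badly understated memory use. A hedged sketch of the conversion with illustrative (not measured) numbers:

#include <cstddef>
#include <cstdio>

// Same conversion as the corrected gInfo print: number of 4-byte floats to GB.
double FloatsToGB(std::size_t num_floats) {
  return num_floats * 4.0 / (1ull << 30);
}

int main() {
  std::size_t num_nodes      = 1000000; // illustrative local node count
  std::size_t feature_length = 128;     // illustrative feature dimension
  // old print read the label count (one float per node); new one reads features
  std::printf("labels:   %.4f GB\n", FloatsToGB(num_nodes));
  std::printf("features: %.4f GB\n", FloatsToGB(num_nodes * feature_length));
  return 0;
}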
--- libcusp/include/galois/graphs/NewGeneric.h | 6 +++--- libgnn/src/graphs/GNNGraph.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index c29127d9e6..710ba82996 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -120,9 +120,9 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(0); bps.push_back(86618); } else if (filename.find("ogbn-papers100M") != std::string::npos) { - // this is entire graph: amazon's mask isn't contiguous - bps.push_back(602); - bps.push_back(111052523); + // whole graph (non-contiguous mask) + bps.push_back(0); + bps.push_back(111059956); } else { // TODO(loc) only die under certain conditions; don't die if something // is missing diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index a75bb47498..1bc4cb830e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -420,7 +420,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( galois::gInfo( host_prefix_, "Read ", local_node_features_.size(), " features (", - local_ground_truth_labels_.size() * double{4} / (1 << 30), " GB)"); + local_node_features_.size() * double{4} / (1 << 30), " GB)"); GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } From 5f1f89696c9000d619f4071011a349f2e4cda85c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Mar 2021 17:12:03 -0600 Subject: [PATCH 501/660] The big GNN space saving commit (part 1) This commit does the following: - Temporary/memoization matrices have been reduced to a minimum of only 1 per layer; its size is the smaller of the input and output matrices. This change, along with distributed execution/masking, required reordering some operations so that the right things are masked at the right time and nothing is overwritten before it is used. This has made the code more annoying to read as a result. Some cleanup will be necessary. - Forward output matrices of layer n now act as the backward output matrix for layer n + 1. This is a huge space savings, but again, it required some changes to keep correctness. For example, for the activation derivative I now have to keep a bitset marking which elements were originally greater than zero (originally this was done by checking the forward output matrix itself, but now it gets overwritten so you can't do that anymore). This change required a signature change to the layer constructors. - Tests now have to pass in a "forward output" for a layer to write the backward output into. This has made things slightly more annoying to read as well. Part 2 will make it so output layers do not allocate their own output matrices and just overwrite the input matrix. This will require some function signature changes, which is why it will be done in a separate commit. Also TODO: Distributed SAGE unit tests.
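The activation memo mentioned above is needed because a layer's forward output buffer is no longer private: a later layer may overwrite it before the backward pass runs, so the ReLU derivative can no longer re-check the forward output for positivity. A minimal sketch of the pattern, using std::vector<bool> as a stand-in for galois::DynamicBitSet and plain loops instead of galois::do_all:

#include <cstddef>
#include <vector>

// Remember which entries were positive during the forward ReLU so the
// derivative can still be computed after the output buffer has been reused.
struct ReluWithMemo {
  std::vector<bool> was_positive_;

  // Forward: clamp negatives to zero and memoize which entries were positive.
  void Forward(std::vector<float>* out) {
    was_positive_.assign(out->size(), false);
    for (std::size_t i = 0; i < out->size(); i++) {
      if ((*out)[i] > 0.0f) {
        was_positive_[i] = true;
      } else {
        (*out)[i] = 0.0f;
      }
    }
  }

  // Backward: pass the gradient through only where the forward value was
  // positive; zero it elsewhere, since dReLU/dx is 0 there.
  void Backward(std::vector<float>* gradient) const {
    for (std::size_t i = 0; i < gradient->size(); i++) {
      if (!was_positive_[i]) {
        (*gradient)[i] = 0.0f;
      }
    }
  }
};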
--- libgnn/include/galois/GNNMath.h | 3 + libgnn/include/galois/layers/DenseLayer.h | 5 +- libgnn/include/galois/layers/GNNLayer.h | 15 +- .../galois/layers/GraphConvolutionalLayer.h | 6 +- libgnn/include/galois/layers/L2NormLayer.h | 7 +- libgnn/include/galois/layers/SAGELayer.h | 11 +- libgnn/include/galois/layers/SigmoidLayer.h | 4 +- libgnn/include/galois/layers/SoftmaxLayer.h | 4 +- libgnn/src/GNNMath.cpp | 6 + libgnn/src/GraphNeuralNetwork.cpp | 23 +- libgnn/src/graphs/GNNGraph.cpp | 6 +- libgnn/src/layers/DenseLayer.cpp | 10 +- libgnn/src/layers/GNNLayer.cpp | 99 +++++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 118 ++++++---- libgnn/src/layers/L2NormLayer.cpp | 7 +- libgnn/src/layers/SAGELayer.cpp | 214 ++++++++++-------- libgnn/src/layers/SigmoidLayer.cpp | 7 +- libgnn/src/layers/SoftmaxLayer.cpp | 10 +- libgnn/test/aggregate-sync-test.cpp | 21 +- libgnn/test/back-conv-test.cpp | 6 +- libgnn/test/convlayer-test.cpp | 10 +- libgnn/test/l2norm-layer-test.cpp | 7 +- libgnn/test/sage-layer-test.cpp | 16 +- libgnn/test/sample-test.cpp | 16 +- libgnn/test/sigmoidlayer-test.cpp | 7 +- libgnn/test/softmaxlayer-test.cpp | 7 +- 26 files changed, 409 insertions(+), 236 deletions(-) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 9e50295200..dd7ee5b479 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -7,6 +7,9 @@ namespace galois { +//! zeros out a vector of some length +void VectorZero(size_t length, GNNFloat* a); + //! Find max index in a vector of some length size_t MaxIndex(const size_t length, const GNNFloat* vector); //! Given 2 float array pointers, do element wise addition of length elements diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index d9918f8c2e..bb651ca30e 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -10,12 +10,15 @@ class DenseLayer : public GNNLayer { //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : DenseLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} + : DenseLayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig()) {} // Parent functions const PointerWithSize diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 47b38a7f73..4e83cdc145 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -44,6 +44,9 @@ struct GNNLayerDimensions { struct GNNLayerConfig { //! True if weights should be allocated bool allocate_weights{true}; + //! If true, disable allocation of the output matrix (used for output layers + //! which can overwrite the input, i.e. passthrough) + bool disable_output{false}; //! Turns off dropout of weights if enabled bool disable_dropout{false}; //! Rate at which to drop things if dropout is on @@ -77,17 +80,19 @@ struct GNNLayerConfig { //! Base class for layers in a graph neural network class GNNLayer { public: - GNNLayer() = delete; //! Creation of a layer needs the # of the layer, the graph to train on, and //! 
the input/output dimensions of the MxM that occurs in the layer; config //! as well GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); //! Uses a default config GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig()) {} GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase @@ -213,8 +218,6 @@ class GNNLayer { // want to allocate memory once to avoid runtime memory allocation. //! The output of the forward phase for this layer. std::vector forward_output_matrix_; - //! The output of the backward phase for this layer. - std::vector backward_output_matrix_; // These are wrapper around the pointer for the data associated with // any GNN layer: takes a CPU or GPU pointer depending on configuration @@ -223,6 +226,7 @@ class GNNLayer { PointerWithSize p_layer_weight_gradients_; PointerWithSize p_forward_output_matrix_; PointerWithSize p_backward_output_matrix_; + galois::DynamicBitSet activation_memo_; //! RNG for matrix initialization PerThreadRNG random_init_rng_{-5.0, 5.0}; @@ -292,6 +296,9 @@ class GNNLayer { } #endif + //! Mask a input size'd matrix's rows that correspond to mirrors + void MaskInputNonMasters(PointerWithSize* input); + //! Mask a gradient size'd matrix's rows that correspond to mirrors void MaskGradientNonMasters(PointerWithSize* gradients); //! Does some math to get GB used by some # of floats diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index e44976f73b..d7a600096d 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -16,14 +16,16 @@ class GraphConvolutionalLayer : public GNNLayer { //! 
weight matrix GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GraphConvolutionalLayer(layer_num, graph, dimensions, - GNNLayerConfig()) {} + : GraphConvolutionalLayer(layer_num, graph, backward_output_matrix, + dimensions, GNNLayerConfig()) {} // Parent functions const PointerWithSize diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h index 176c88700e..34ac3983e1 100644 --- a/libgnn/include/galois/layers/L2NormLayer.h +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -11,13 +11,16 @@ namespace galois { class L2NormLayer : public GNNLayer { public: L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : L2NormLayer(layer_num, graph, dimensions, + : L2NormLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}) {} L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, dimensions, config) { + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config) { layer_type_ = galois::GNNLayerType::kL2Norm; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 9dc7007931..056ea748c1 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -28,17 +28,22 @@ class SAGELayer : public GNNLayer { //! memory for temporary matrices. Also initializes sync substrate for the //! 
weight matrix SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, const SAGELayerConfig& sage_config); SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : SAGELayer(layer_num, graph, dimensions, config, SAGELayerConfig()) {} + : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, config, + SAGELayerConfig()) {} SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : SAGELayer(layer_num, graph, dimensions, GNNLayerConfig(), - SAGELayerConfig()) {} + : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig(), SAGELayerConfig()) {} void InitSelfWeightsTo1() { if (layer_weights_2_.size()) { diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 7efe8cd9db..5a2f9f6894 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -11,8 +11,10 @@ namespace galois { class SigmoidLayer : public GNNLayer { public: SigmoidLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}), input_loss_(dimensions.input_rows), norm_gradient_vectors_(dimensions.input_columns) { diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 7bf29272cd..444a383386 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -12,8 +12,10 @@ namespace galois { class SoftmaxLayer : public GNNLayer { public: SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}), #ifdef GALOIS_ENABLE_GPU gpu_object_(graph.GetGPUGraph()), diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index aef3dae6dd..afb3712981 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -3,6 +3,12 @@ #include "galois/GNNMath.h" #include "galois/Logging.h" +void galois::VectorZero(size_t length, GNNFloat* a) { + for (size_t i = 0; i < length; i++) { + a[i] = 0; + } +} + size_t galois::MaxIndex(const size_t length, const GNNFloat* vector) { size_t index = 0; GNNFloat cur_max = vector[0]; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index b31a31ecd1..9e944d0568 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -22,6 +22,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( graph_->ResizeLayerVector(config_.num_intermediate_layers()); } #endif + // used for chaining layers together; begins as nullptr + PointerWithSize prev_output_layer(nullptr, 0); + // create the intermediate layers for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { GNNLayerType layer_type = config_.intermediate_layer_type(i); @@ -43,7 +46,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( switch (layer_type) { 
case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { graph_->InitLayerVectorMetaObjects( @@ -54,21 +58,24 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; case GNNLayerType::kSAGE: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) sage layer gpu #endif break; case GNNLayerType::kL2Norm: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) l2 layer gpu #endif break; case GNNLayerType::kDense: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) dense layer gpu #endif @@ -76,6 +83,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + // update output layer for next layer + prev_output_layer = gnn_layers_.back()->GetForwardOutput(); } // loop backward and find last GCN/SAGE (main) layer to disable activation @@ -102,11 +111,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( switch (config_.output_layer_type()) { case (GNNOutputLayerType::kSoftmax): gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, output_dims))); + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); break; case (GNNOutputLayerType::kSigmoid): gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, output_dims))); + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 1bc4cb830e..89cdca94e9 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -418,9 +418,9 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( } full_feature_set.reset(); - galois::gInfo( - host_prefix_, "Read ", local_node_features_.size(), " features (", - local_node_features_.size() * double{4} / (1 << 30), " GB)"); + galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), + " features (", + local_node_features_.size() * double{4} / (1 << 30), " GB)"); GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index b2da6bf010..75e715e482 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -2,11 +2,11 @@ #include "galois/GNNMath.h" #include "galois/layers/DenseLayer.h" -galois::DenseLayer::DenseLayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, dimensions, config), +galois::DenseLayer::DenseLayer( + size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* 
backward_output_matrix, + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index cde5698a93..b88f91b631 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -4,6 +4,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), @@ -37,23 +38,29 @@ galois::GNNLayer::GNNLayer(size_t layer_num, GlorotBengioInit(&layer_weights_); } - size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", forward output matrix ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); - forward_output_matrix_.resize(num_output_elements, 0); + if (!config_.disable_output) { + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", forward output matrix ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + forward_output_matrix_.resize(num_output_elements, 0); + } + if (layer_number_ != 0) { - galois::gInfo( - graph_.host_prefix(), "Creating layer ", layer_number_, - ", backward output matrix ", - layer_dimensions_.input_rows * layer_dimensions_.input_columns, " (", - FloatElementsToGB(layer_dimensions_.input_rows * - layer_dimensions_.input_columns), - " GB)"); - backward_output_matrix_.resize( - layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); + GALOIS_LOG_VASSERT( + backward_output_matrix->size() == + layer_dimensions_.input_rows * layer_dimensions_.input_columns, + "backward output size {} should equal input size {}", + backward_output_matrix->size(), + layer_dimensions_.input_rows * layer_dimensions_.input_columns); + } else { + GALOIS_LOG_VASSERT(backward_output_matrix->data() == nullptr, + "layer 0 should null ptr backward output"); + GALOIS_LOG_VASSERT(backward_output_matrix->size() == 0, + "layer 0 should size 0 backward output"); } + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.InitInOutMemory(num_output_elements, @@ -68,8 +75,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weight_gradients_.size()); p_forward_output_matrix_ = PointerWithSize( base_gpu_object_.forward_output(), forward_output_matrix_.size()); - p_backward_output_matrix_ = PointerWithSize( - base_gpu_object_.backward_output(), backward_output_matrix_.size()); + p_backward_output_matrix_ = *backward_output_matrix; // TODO can clear the cpu side vectors/don't use .size() since optimally // they aren't initialized } else { @@ -80,8 +86,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, PointerWithSize(layer_weight_gradients_); p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); - p_backward_output_matrix_ = - PointerWithSize(backward_output_matrix_); + p_backward_output_matrix_ = *backward_output_matrix; #ifdef GALOIS_ENABLE_GPU } #endif @@ -221,7 +226,7 @@ void galois::GNNLayer::ReconstructDropoutMatrix( void 
galois::GNNLayer::DoDropoutDerivative() { galois::StatTimer timer("BackwardDropout", "GNNLayer"); timer.start(); - assert(backward_output_matrix_.size() == dropout_mask_.size()); + assert(p_backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); #ifdef GALOIS_ENABLE_GPU @@ -232,11 +237,12 @@ void galois::GNNLayer::DoDropoutDerivative() { #endif // use dropout mask to figure out derivative galois::do_all( - galois::iterate(static_cast(0), backward_output_matrix_.size()), + galois::iterate(static_cast(0), + p_backward_output_matrix_.size()), [&](size_t i) { - backward_output_matrix_[i] = backward_output_matrix_[i] * - static_cast(dropout_mask_[i]) * - scale; + p_backward_output_matrix_[i] = + p_backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * scale; }, galois::loopname("LayerDropoutDerivative")); #ifdef GALOIS_ENABLE_GPU @@ -249,13 +255,22 @@ void galois::GNNLayer::Activation() { galois::StatTimer timer("ForwardActivation", "GNNLayer"); timer.start(); + if (activation_memo_.size() == 0) { + activation_memo_.resize(forward_output_matrix_.size()); + } + activation_memo_.reset(); + // TODO only does relu at the moment; should check user specified activation // and act accordingly galois::do_all( galois::iterate(static_cast(0), forward_output_matrix_.size()), [&](size_t i) { - forward_output_matrix_[i] = - std::max(forward_output_matrix_.at(i), static_cast(0)); + if (forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + forward_output_matrix_[i] = 0; + } }, galois::loopname("ReLU")); timer.stop(); @@ -268,14 +283,14 @@ void galois::GNNLayer::ActivationDerivative( // TODO only does relu at the moment; should check user specified activation // and act accordingly - // keep gradient if the original output is greater than 0 + // keep gradient if the original output was greater than 0 galois::do_all( galois::iterate(static_cast(0), gradient->size()), [&](size_t i) { - (*gradient)[i] = - (forward_output_matrix_.at(i) > static_cast(0)) - ? 
(*gradient)[i] - : static_cast(0); + // it was <= 0 before; set back to 0 + if (!activation_memo_.test(i)) { + (*gradient)[i] = 0; + } }, galois::loopname("ReLU-Derivative")); timer.stop(); @@ -304,6 +319,28 @@ void galois::GNNLayer::WeightGradientSyncSum() { t.stop(); } +void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) mask away the **non** masters on gpu + GALOIS_LOG_FATAL("implement this"); +#else + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.size(); + size_t row_index = layer_dimensions_.input_columns; + assert((row_index * layer_dimensions_.input_rows) == input->size()); + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskInputNonMasters")); +#endif +} + void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient) { #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 44d2df6529..6f86cf1395 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -4,18 +4,25 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, dimensions, config), + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); - if (config_.disable_aggregate_after_update || + if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); + in_temp_1_.resize(num_input_elements, 0); + } + + // only on in dropout case + if in temp is smaller than out temp + if (!config_.disable_dropout && + (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", GCN input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); @@ -24,10 +31,17 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); + + // only needed if out temp would be smaller than intemp + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + // 
xform matrix first to work with a smaller output size + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + out_temp_.resize(num_output_elements, 0); + } + layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -59,24 +73,27 @@ galois::GraphConvolutionalLayer::ForwardPhase( GALOIS_LOG_VERBOSE("Calling forward phase"); assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_in_temp_1_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; // first, dropout if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(input_embeddings, &p_in_temp_1_); input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); } // flip aggregate/update if dimensions favor it (do less work) if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, - p_in_temp_2_.data(), &input_column_intermediates_); - UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); + AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); } else { // update to aggregate // FW @@ -115,43 +132,34 @@ galois::GraphConvolutionalLayer::BackwardPhase( // AFW = O galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; if (!config_.disable_dropout) { // dropout result is currently stored in temp 1 // needs to be used before it gets overwritten input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; } else { // no dropout = use vanilla input input_data = prev_layer_input; + agg_data = p_in_temp_1_; } + // NOTE: PREV LAYER INPUT AND BACKWARDOUTPUT ARE THE SAME MEMORY LOCATION; + // BEWARE OF DEPENDENCIES + // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (layer_number_ != 0) { - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(p_in_temp_1_.size() == - layer_dimensions_.input_columns * layer_dimensions_.input_rows); - // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); - // TODO if training A, then A' compute here if layer # is 0 - // dot product of edges that exist in A - } - // weight gradient calculation - // TODO(loc) put this in a function to put the ifdef in there - MaskGradientNonMasters(input_gradient); + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + MaskInputNonMasters(&agg_data); + #ifdef GALOIS_ENABLE_GPU if (device_personality 
== DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, p_in_temp_2_.data(), + layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif @@ -159,11 +167,26 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - p_in_temp_2_.data(), input_gradient->data(), + agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif + + // gradient isn't masked here; only temp1, which has already been + // overwritten = fine + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } } else { // TODO at this point, out_temp contains memoized FW // can use it to get A' = O' (FW)^T @@ -172,15 +195,19 @@ galois::GraphConvolutionalLayer::BackwardPhase( // this is (FW)' AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); + + // done after above because input_data = p_backward_output_matrix in some + // cases; use first before overwriting here if layer # doesn't = 0, it means + // I can mess with the input data itself instad of masking the gradients I + // can mask the input if (layer_number_ != 0) { - // derivative for update - // backout = F' - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); + MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + MaskGradientNonMasters(&p_out_temp_); } - // W' = F^T (FW)' - MaskGradientNonMasters(&p_out_temp_); - // TODO put this in a function + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -197,6 +224,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( #ifdef GALOIS_ENABLE_GPU } #endif + + if (layer_number_ != 0) { + // can now overwrite p_backward without issue; since input gradient + // is untouched if layer number isn't 0 this will be correct + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); + } } // sync weight gradients; note aggregation sync occurs in the function call diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index a29fccab1d..d7c04c52e9 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -62,7 +62,8 @@ galois::PointerWithSize galois::L2NormLayer::BackwardPhase( galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { - backward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), + [&](size_t i) { p_backward_output_matrix_[i] = 0; }); const size_t 
feature_length = layer_dimensions_.input_columns; // derivative of some x_1 is sum of gradient w.r.t. x_1 for all elements of @@ -108,7 +109,7 @@ galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( for (size_t row_index = row_offset; row_index < (row_offset + feature_length); row_index++) { - backward_output_matrix_[row_index] = + p_backward_output_matrix_[row_index] = denominator * (prev_layer_input[row_index] * mult_with_input + (*input_gradient)[row_index] * running_square_sum); @@ -117,5 +118,5 @@ galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( }, galois::loopname("L2Backward")); - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 9f80000bdd..8fde856ac8 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -4,10 +4,12 @@ galois::SAGELayer::SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, const SAGELayerConfig& sage_config) - : GNNLayer(layer_num, graph, dimensions, config), sage_config_(sage_config), + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), + sage_config_(sage_config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { if (!sage_config_.disable_concat) { @@ -38,13 +40,20 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); - // only need to allocate if input <= output because not used otherwise - if (config_.disable_aggregate_after_update || + + // if in temp is smaller than out temp, or if dropout exists + if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); + in_temp_1_.resize(num_input_elements, 0); + } + + // only on in dropout case + if in temp is smaller than out temp + if (!config_.disable_dropout && + (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); @@ -53,10 +62,16 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); + + // only needed if out temp would be smaller than intemp + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " 
GB)"); + out_temp_.resize(num_output_elements, 0); + } + layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) GPU SAGE @@ -112,15 +127,18 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_in_temp_1_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; // first, dropout if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(input_embeddings, &p_in_temp_1_); input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); } // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part @@ -130,9 +148,9 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, - p_in_temp_2_.data(), &input_column_intermediates_); - UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); + AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); } else { // update to aggregate // FW @@ -176,15 +194,47 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // if dropout was used, use the dropout matrix for the input - galois::PointerWithSize input_to_use; + galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; if (!config_.disable_dropout) { // dropout result is currently stored in temp 1 // needs to be used before it gets overwritten - input_to_use = p_in_temp_1_; + input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; } else { // no dropout = use vanilla input - input_to_use = prev_layer_input; + input_data = prev_layer_input; + agg_data = p_in_temp_1_; + } + + // aggregate this here before gradient starts to get overwritten + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), + p_out_temp_.data(), &output_column_intermediates_, true); + } + + if (!sage_config_.disable_concat) { + if (layer_number_ != 0) { + MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + MaskGradientNonMasters(input_gradient); + } + // input data (prev layer input or temp1) or gradient need mask + // can mask gradient if layer == 0 + // otherwise must mask other + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_data.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); } + WeightGradientSyncSum2(); // AFW = O @@ -192,32 +242,19 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= 
layer_dimensions_.output_columns) { - if (layer_number_ != 0) { - // ---unmasked--- - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(p_in_temp_1_.size() == - layer_dimensions_.input_columns * layer_dimensions_.input_rows); - // pintemp1 contains (AF)' - // overwrites the dropout matrix that was in ptemp1 (needed for second - // weight matrix) - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + if (layer_number_ != 0 || sage_config_.disable_concat) { + MaskInputNonMasters(&agg_data); } - // weight gradient calculation - // TODO(loc) put this in a function to put the ifdef in there - // ---masked--- - MaskGradientNonMasters(input_gradient); + // if concat is disabled, then input grad isn't masked; therefore, mask + // this to get the same effect + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, p_in_temp_2_.data(), + layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif @@ -225,91 +262,76 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - p_in_temp_2_.data(), input_gradient->data(), + agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif + + // 0 means input gradient shouldn't get masked + if (layer_number_ != 0) { + // ---unmasked--- + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + // overwrites the dropout matrix that was in ptemp1 (needed for second + // weight matrix) + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } } else { - // aggregate occurs regardless of layer being equal to 0 because it is - // required in this case for the weight gradient calculation - // this is (FW)' // --unmasked-- - AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_, true); - if (layer_number_ != 0) { - // derivative for update - // backout = F' - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); + // disable concat part is here because otherwise it would get done elsewhere + if (layer_number_ != 0 && sage_config_.disable_concat) { + MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + MaskGradientNonMasters(&p_out_temp_); } - // TODO put this in a function + // W' = F^T (FW)' - // 
input to use is not overwritten in this branch so it's safe to use - // --- masked ---, uses ptemp1 - MaskGradientNonMasters(&p_out_temp_); + // TODO put this in a function #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, input_to_use.data(), + layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_to_use.data(), + layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif - } - - if (!sage_config_.disable_concat) { - // Fw1 + AFW2 = O; self feature has own weight matrix and makes own - // contribution to gradients which is handled in this block - // second weight matrix: reconstruct the dropout matrix if it was - // overwritten into temp1 - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (!config_.disable_dropout) { - // input gradients have already been masked; need to reconstruct the - // dropout matrix which we can do since we saved the dropout mask - // save it into ptemp1 - ReconstructDropoutMatrix(prev_layer_input, &p_in_temp_1_); - // !!!NOTE!!! - // If you're using dropout in the distributed setting you've already - // thrown consistency out the window anyways because distributed RNG - // will make it so each host does something different - // Therefore, this op above is nothing more than a feeble attempt - // at getting *some* notion of consistency - } - } else { - // mask original input gradients since this path masks the aggregated - // gradients only - MaskGradientNonMasters(input_gradient); - // in dropout case, ptemp1 (contained in input to use) still contains the - // dropout matrix so no need to recompute - } - - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - input_to_use.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); - WeightGradientSyncSum2(); if (layer_number_ != 0) { - // deal with feature gradients for the self feature here - // this function will sum directly into the backward matrix - SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), - p_backward_output_matrix_.data()); + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); } } - WeightGradientSyncSum(); + // full gradient needed here; should occur after all updates + if (layer_number_ != 0) { + // deal with feature gradients for the self feature here + // this function will sum directly into the backward matrix + // input gradient never gets masked if layer number != 0 + SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), + p_backward_output_matrix_.data()); + } + if (!config_.disable_dropout && layer_number_ != 0) { DoDropoutDerivative(); } diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 317811b6df..f2a421bed1 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -68,7 +68,8 @@ galois::SigmoidLayer::ForwardPhase( galois::PointerWithSize galois::SigmoidLayer::BackwardPhaseCPU() { 
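  // Note on the zeroing below: the backward output is no longer a std::vector
  // owned by this layer; p_backward_output_matrix_ is a PointerWithSize view
  // over the buffer passed in at construction (the previous layer's forward
  // output), so .assign() cannot be used and the view is instead cleared in
  // place with a parallel loop.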
const size_t feature_length = layer_dimensions_.input_columns; - backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), + [&](size_t i) { p_backward_output_matrix_[i] = 0; }); galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -86,7 +87,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { // sigmoid-cross-entropy derivative: turns out all it is is simple // subtraction for (unsigned index = 0; index < feature_length; index++) { - backward_output_matrix_[node_offset + index] = + p_backward_output_matrix_[node_offset + index] = forward_output_matrix_[node_offset + index] - ground_truth[index]; } @@ -94,7 +95,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { }, galois::steal(), galois::loopname("SigmoidBackward")); - return backward_output_matrix_; + return p_backward_output_matrix_; } galois::PointerWithSize diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 10ed93c8ff..f0ded3ac49 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -79,8 +79,8 @@ galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; - // zero out output - backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), + [&](size_t i) { p_backward_output_matrix_[i] = 0; }); galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -101,11 +101,11 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { for (size_t idx = 0; idx < feature_length; idx++) { if (idx == correct) { // positive class - backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx] = forward_output_matrix_[node * feature_length + idx] - 1; } else { // negative class - backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx] = forward_output_matrix_[node * feature_length + idx]; } } @@ -113,7 +113,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { }, galois::steal(), galois::loopname("SoftmaxBackward")); - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } galois::PointerWithSize diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index c3e3439a5e..eac1f89c84 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -34,13 +34,17 @@ int main() { dimension_0.input_columns = 3; dimension_0.output_columns = 2; galois::GNNLayerConfig l_config; - l_config.disable_aggregate_after_update = false; l_config.DebugConfig(); + l_config.disable_aggregate_after_update = true; + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(test_graph->size() * 3); + galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner galois::PointerWithSize layer_0_forward_output = @@ -122,8 +126,8 @@ int main() { // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// std::unique_ptr layer_1 = - std::make_unique(1, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 1, *(test_graph.get()), &p_back, 
dimension_0, l_config); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph->GetLocalFeatures()); @@ -229,7 +233,7 @@ int main() { // create the layer, no norm factor layer_0 = std::make_unique( - 0, *(test_graph_2.get()), dimension_0, l_config); + 0, *(test_graph_2.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -293,8 +297,11 @@ int main() { } } + std::vector back_matrix_2(test_graph_2->size() * 3); + galois::PointerWithSize p_back_2(back_matrix_2); + layer_1 = std::make_unique( - 1, *(test_graph_2.get()), dimension_0, l_config); + 1, *(test_graph_2.get()), &p_back_2, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1_forward_output = layer_1->ForwardPhase(test_graph_2->GetLocalFeatures()); diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp index b1c9c025c6..480058f6ae 100644 --- a/libgnn/test/back-conv-test.cpp +++ b/libgnn/test/back-conv-test.cpp @@ -60,14 +60,18 @@ int main() { galois::GNNLayerConfig dcon; dcon.DebugConfig(); + dcon.disable_aggregate_after_update = true; // dummy 1 matrix std::vector dummy_ones_v(test_graph.size() * 2, 1); galois::PointerWithSize dummy_ones(dummy_ones_v); + std::vector back_matrix(test_graph.size() * 3); + galois::PointerWithSize p_back(back_matrix); + // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 309433845b..5902d059fa 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -55,9 +55,13 @@ int main() { dcon.disable_aggregate_after_update = false; dcon.DebugConfig(); + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, test_graph, + std::make_unique(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -122,7 +126,7 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = @@ -199,7 +203,7 @@ int main() { // don't have time for at the moment // TODO in future maybe add better unit test for this std::unique_ptr layer_2 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, config); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp index a66c419a7f..ca30c99ac0 100644 --- a/libgnn/test/l2norm-layer-test.cpp +++ b/libgnn/test/l2norm-layer-test.cpp @@ -35,8 +35,11 @@ int main() { l2_input[12] = 4; l2_input[13] = 3; - auto l2_layer = - std::make_unique(2, test_graph, dimension_0); + std::vector back_matrix(14); + galois::PointerWithSize p_back(back_matrix); + + auto l2_layer = std::make_unique(2, test_graph, &p_back, + dimension_0); galois::PointerWithSize normed = l2_layer->ForwardPhase(l2_input); diff --git a/libgnn/test/sage-layer-test.cpp 
b/libgnn/test/sage-layer-test.cpp index 39a2cd2635..830e147a7c 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -28,9 +28,13 @@ int main() { galois::SAGELayerConfig scon; scon.disable_concat = false; + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + std::unique_ptr layer_0 = - std::make_unique(0, test_graph, dimension_0, dcon, - scon); + std::make_unique(0, test_graph, &p_null, dimension_0, + dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self layer_0->InitSelfWeightsTo1(); @@ -109,8 +113,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique(1, test_graph, dimension_0, - dcon, scon); + auto layer_1 = std::make_unique(1, test_graph, &p_back, + dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -201,8 +205,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique(1, test_graph, dimension_0, - config, scon); + auto layer_2 = std::make_unique(1, test_graph, &p_back, + dimension_0, config, scon); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 063ff80ca5..3540582ade 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -40,8 +40,11 @@ int main() { ////////////////////////////////////////////////////////////////////////////// + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->EnableSampling(); @@ -139,8 +142,11 @@ int main() { softmax_input[40] = 1; softmax_input[48] = 1; - auto output_layer = - std::make_unique(3, test_graph, dimension_out); + std::vector back_matrix_2(49); + galois::PointerWithSize p_back_2(back_matrix_2); + + auto output_layer = std::make_unique( + 3, test_graph, &p_back_2, dimension_out); output_layer->EnableSampling(); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); @@ -183,8 +189,8 @@ int main() { galois::graphs::GNNGraph multi_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, false); - auto sigmoid_layer = - std::make_unique(3, multi_graph, dimension_out); + auto sigmoid_layer = std::make_unique( + 3, multi_graph, &p_back_2, dimension_out); sigmoid_layer->EnableSampling(); // reuse softmax input; only thing interested in is checking for 0s prediction_distribution = sigmoid_layer->ForwardPhase(softmax_input); diff --git a/libgnn/test/sigmoidlayer-test.cpp b/libgnn/test/sigmoidlayer-test.cpp index 333651bdf5..0bc2cd7252 100644 --- a/libgnn/test/sigmoidlayer-test.cpp +++ b/libgnn/test/sigmoidlayer-test.cpp @@ -47,9 +47,12 @@ int main() { softmax_input[40] = 0; softmax_input[48] = 0; + std::vector back_matrix(49); + galois::PointerWithSize p_back(back_matrix); + // train mode - auto output_layer = - std::make_unique(3, test_graph, dimension_0); + auto output_layer = std::make_unique( + 3, test_graph, &p_back, dimension_0); output_layer->ForwardPhase(softmax_input); galois::PointerWithSize asdf = diff --git a/libgnn/test/softmaxlayer-test.cpp 
b/libgnn/test/softmaxlayer-test.cpp index 7a6de416dc..b85e0b4bb6 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -39,9 +39,12 @@ int main() { softmax_input[40] = 1; softmax_input[48] = 1; + std::vector back_matrix(49); + galois::PointerWithSize p_back(back_matrix); + // train mode - auto output_layer = - std::make_unique(3, test_graph, dimension_0); + auto output_layer = std::make_unique( + 3, test_graph, &p_back, dimension_0); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); From d95af12fbb373177ed500d587b85b7fea757a21c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Mar 2021 17:27:17 -0600 Subject: [PATCH 502/660] Backward pass call takes non-const forward input Allow Backward Pass to mess with the input from the previous layer. Reason for this is to let the output layers reuse it to save significant space. --- libgnn/include/galois/layers/DenseLayer.h | 2 +- libgnn/include/galois/layers/GNNLayer.h | 2 +- libgnn/include/galois/layers/GraphConvolutionalLayer.h | 2 +- libgnn/include/galois/layers/L2NormLayer.h | 4 ++-- libgnn/include/galois/layers/SAGELayer.h | 2 +- libgnn/include/galois/layers/SigmoidLayer.h | 2 +- libgnn/include/galois/layers/SoftmaxLayer.h | 2 +- libgnn/src/layers/L2NormLayer.cpp | 2 +- libgnn/src/layers/SigmoidLayer.cpp | 2 +- libgnn/src/layers/SoftmaxLayer.cpp | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index bb651ca30e..7b00d1987c 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -25,7 +25,7 @@ class DenseLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 4e83cdc145..b5fb109ffe 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -154,7 +154,7 @@ class GNNLayer { //! @returns Output of the backward phase (i.e. input to previous layer); note //! it's a pointer because layer can mess with it virtual PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) = 0; //! 
Given an optimizer, update the weights in this layer based on gradients diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index d7a600096d..988276965d 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -32,7 +32,7 @@ class GraphConvolutionalLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h index 34ac3983e1..0ed1a0d0df 100644 --- a/libgnn/include/galois/layers/L2NormLayer.h +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -31,7 +31,7 @@ class L2NormLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings); PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient); private: @@ -39,7 +39,7 @@ class L2NormLayer : public GNNLayer { ForwardPhaseCPU(const PointerWithSize input_embeddings); PointerWithSize - BackwardPhaseCPU(const PointerWithSize prev_layer_input, + BackwardPhaseCPU(PointerWithSize prev_layer_input, PointerWithSize* input_gradient); //! No op diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 056ea748c1..b5ee978067 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -61,7 +61,7 @@ class SAGELayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 5a2f9f6894..209929bf30 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -32,7 +32,7 @@ class SigmoidLayer : public GNNLayer { //! Get gradients to fix distribution such that it leans more towards //! multiclass ground truth. PointerWithSize - BackwardPhase(const PointerWithSize, + BackwardPhase(PointerWithSize, PointerWithSize*) final; private: diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 444a383386..5fae882531 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -43,7 +43,7 @@ class SoftmaxLayer : public GNNLayer { //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
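  //! Note: the previous layer's input is no longer taken as const here, so
  //! output layers may overwrite it in place and avoid allocating a separate
  //! matrix, as described in this commit's message.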
PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index d7c04c52e9..bcf66eb2f9 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -51,7 +51,7 @@ galois::L2NormLayer::ForwardPhaseCPU( } galois::PointerWithSize galois::L2NormLayer::BackwardPhase( - const PointerWithSize prev_layer_input, + PointerWithSize prev_layer_input, PointerWithSize* input_gradient) { #ifdef GALOIS_ENABLE_GPU // TODO diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index f2a421bed1..1809decc8a 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -99,7 +99,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { } galois::PointerWithSize -galois::SigmoidLayer::BackwardPhase(const PointerWithSize, +galois::SigmoidLayer::BackwardPhase(PointerWithSize, PointerWithSize*) { #ifdef GALOIS_ENABLE_GPU // TODO(loc) when GPU needs it diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index f0ded3ac49..57eebd005e 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -117,7 +117,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { } galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, +galois::SoftmaxLayer::BackwardPhase(PointerWithSize, PointerWithSize*) { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { From 347018f1ee39b8e467da1e9baec608b1d092405d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Mar 2021 20:12:57 -0600 Subject: [PATCH 503/660] GNN space saving (part 2): no softmax output The softmax layer now reuses the output layer from the layer before it in all of its operations, making it possible to completely avoid allocating another matrix (resulting in *huge* space savings if # of input classes is sufficiently large). The problem with this is now you cannot check the output matrix of the layer before it (because softmax will destroy it), and running backward phase will destroy the predictions (meaning you have to check accuracy before you do gradient descent). Some tests had to be changed as a result of the changes described above. --- libgnn/include/galois/layers/SoftmaxLayer.h | 7 ++- libgnn/src/GraphNeuralNetwork.cpp | 5 +- libgnn/src/layers/SoftmaxLayer.cpp | 25 ++++---- libgnn/test/gnnfb-test.cpp | 64 ++++++++++----------- libgnn/test/softmaxlayer-test.cpp | 37 ++++++------ 5 files changed, 73 insertions(+), 65 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 5fae882531..433d055f83 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -15,8 +15,9 @@ class SoftmaxLayer : public GNNLayer { PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, - GNNLayerConfig{.allocate_weights = false}), + : GNNLayer( + layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig{.allocate_weights = false, .disable_output = true}), #ifdef GALOIS_ENABLE_GPU gpu_object_(graph.GetGPUGraph()), #endif @@ -43,7 +44,7 @@ class SoftmaxLayer : public GNNLayer { //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
PointerWithSize - BackwardPhase(PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize in_out, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 9e944d0568..4942076b23 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -171,11 +171,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); + // have to get accuracy here because gradient prop destroys the predictions + // matrix + train_accuracy = GetGlobalAccuracy(predictions); GradientPropagation(); epoch_timer.stop(); - train_accuracy = GetGlobalAccuracy(predictions); - if (this_host == 0) { const std::string t_name_acc = "TrainEpoch" + std::to_string(epoch) + "Accuracy"; diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 57eebd005e..47d5f2ce0b 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -5,8 +5,8 @@ const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { + // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); - forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; #ifndef NDEBUG //#ifdef NDEBUG @@ -20,14 +20,17 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) { + VectorZero(feature_length, + &p_backward_output_matrix_[i * feature_length]); return; + } } if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], - &forward_output_matrix_[feature_length * i]); + &p_backward_output_matrix_[feature_length * i]); // create ground truth vector for this LID std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); @@ -40,12 +43,15 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // calculate loss for this LID (note not all i will be filled) input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), - &forward_output_matrix_[feature_length * i]); + &p_backward_output_matrix_[feature_length * i]); #ifndef NDEBUG //#ifdef NDEBUG loss_accum += input_loss_[i]; handled += 1; #endif + } else { + VectorZero(feature_length, + &p_backward_output_matrix_[i * feature_length]); } }, // TODO chunk size? 
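// A minimal, self-contained sketch (plain C++, not this library's API) of why
// softmax + cross-entropy can share a single buffer as the hunks above do:
// the forward pass may overwrite the logits with probabilities, and the
// backward pass only needs those probabilities plus the label to form the
// gradient (probability - onehot) in the same storage. All names below are
// illustrative.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Overwrites `row` (logits) with softmax probabilities.
void SoftmaxInPlace(std::vector<float>& row) {
  float max_val = *std::max_element(row.begin(), row.end());
  float sum     = 0.0f;
  for (float& v : row) {
    v = std::exp(v - max_val); // subtract the max for numerical stability
    sum += v;
  }
  for (float& v : row) {
    v /= sum;
  }
}

// Cross-entropy loss for a single ground-truth class, given probabilities.
float CrossEntropy(const std::vector<float>& prob, size_t label) {
  return -std::log(std::max(prob[label], 1e-30f));
}

// Overwrites `row` (probabilities) with d(loss)/d(logits); for softmax +
// cross-entropy this is simply probability - onehot(label).
void SoftmaxCrossEntropyGradientInPlace(std::vector<float>& row, size_t label) {
  row[label] -= 1.0f;
}

int main() {
  std::vector<float> row = {1.0f, 2.0f, 0.5f}; // logits for one node
  SoftmaxInPlace(row);                         // forward: row now holds probabilities
  float loss = CrossEntropy(row, /*label=*/1); // read loss/accuracy here...
  SoftmaxCrossEntropyGradientInPlace(row, 1);  // ...because backward overwrites row
  return loss > 0.0f ? 0 : 1;
}
// As the commit message notes, the backward step destroys the predictions, so
// accuracy has to be read off the shared buffer before gradients are taken.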
@@ -58,7 +64,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::gPrint("Loss is ", reduced_loss / t, "\n"); #endif - return forward_output_matrix_; + return p_backward_output_matrix_; } const galois::PointerWithSize @@ -79,9 +85,6 @@ galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; - galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), - [&](size_t i) { p_backward_output_matrix_[i] = 0; }); - galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned node) { @@ -102,11 +105,11 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { if (idx == correct) { // positive class p_backward_output_matrix_[node * feature_length + idx] = - forward_output_matrix_[node * feature_length + idx] - 1; + p_backward_output_matrix_[node * feature_length + idx] - 1; } else { // negative class p_backward_output_matrix_[node * feature_length + idx] = - forward_output_matrix_[node * feature_length + idx]; + p_backward_output_matrix_[node * feature_length + idx]; } } } @@ -123,7 +126,7 @@ galois::SoftmaxLayer::BackwardPhase(PointerWithSize, if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.BackwardPhaseGPU( layer_phase_, graph_.size(), layer_dimensions_.input_columns, - p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); + p_backward_output_matrix_.data(), p_backward_output_matrix_.data()); return p_backward_output_matrix_; } #endif diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 091c6f01c8..b99c8aeb8d 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -44,7 +44,7 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // forward phase ////////////////////////////////////////////////////////////////////////////// - gnn->DoInference(); + const galois::PointerWithSize fo_out = gnn->DoInference(); // check output for layers to make sure it's as expected galois::PointerWithSize lf0_out = @@ -72,33 +72,36 @@ int main() { GALOIS_LOG_ASSERT(lf0_out[24 + i] == 15); } - const galois::PointerWithSize lf1_out = - gnn->GetIntermediateLayer(1)->GetForwardOutput(); - GALOIS_LOG_ASSERT(lf1_out.size() == 49); - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[0 + i] == 24); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[7 + i] == 60); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[14 + i] == 96); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[21 + i] == 144); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[28 + i] == 192); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[35 + i] == 156); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); - } + // Disabled: this test worked in past because forward outputs were all + // separate matrices, but due to space saving measures this forward output + // gets messed with by the softmax call + + // const galois::PointerWithSize lf1_out = + // gnn->GetIntermediateLayer(1)->GetForwardOutput(); + // GALOIS_LOG_ASSERT(lf1_out.size() == 49); + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_VASSERT(lf1_out[0 + i] == 24, "{} vs {} (correct)", lf1_out[0 + + // i], 24); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[7 + i] == 60); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[14 + i] == 96); + //} + // for (size_t i = 0; i < 7; i++) { + // 
GALOIS_LOG_ASSERT(lf1_out[21 + i] == 144); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[28 + i] == 192); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[35 + i] == 156); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); + //} - const galois::PointerWithSize fo_out = - gnn->GetOutputLayer()->GetForwardOutput(); GALOIS_LOG_ASSERT(fo_out.size() == 49); // since row all same, prob distribution across row should be same for (size_t c = 0; c < 49; c += 7) { @@ -127,9 +130,8 @@ int main() { ////////////////////////////////////////////////////////////////////////////// gnn->SetLayerPhases(galois::GNNPhase::kValidate); gnn->SetAllLayerWeightsTo1(); - gnn->DoInference(); const galois::PointerWithSize fo_out_val = - gnn->GetOutputLayer()->GetForwardOutput(); + gnn->DoInference(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { GALOIS_LOG_ASSERT(fo_out_val[c + i] == fo_out_val[c + i + 1]); @@ -150,9 +152,7 @@ int main() { // all but last should be 0s gnn->SetLayerPhases(galois::GNNPhase::kTest); gnn->SetAllLayerWeightsTo1(); - gnn->DoInference(); - galois::PointerWithSize fo_out_test = - gnn->GetOutputLayer()->GetForwardOutput(); + galois::PointerWithSize fo_out_test = gnn->DoInference(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { GALOIS_LOG_ASSERT(fo_out_test[c + i] == fo_out_test[c + i + 1]); diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index b85e0b4bb6..66c4e557bc 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -48,13 +48,6 @@ int main() { galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); - galois::PointerWithSize asdf = - output_layer->BackwardPhase(softmax_input, nullptr); - printf("Output 1\n========\n"); - for (unsigned i = 0; i < asdf.size(); i++) { - printf("%f\n", asdf[i]); - } - // assert that predictions are as expected for (size_t i = 0; i < 5; i++) { GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[i * 7])) == @@ -71,15 +64,19 @@ int main() { GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); } + // NOTE: checked before backward because backward overwrites this matrix + + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 1\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } + // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); galois::PointerWithSize pd2 = output_layer->ForwardPhase(softmax_input); - asdf = output_layer->BackwardPhase(softmax_input, nullptr); - printf("Output 2\n========\n"); - for (unsigned i = 0; i < asdf.size(); i++) { - printf("%f\n", asdf[i]); - } // validate vertex is index 5 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); @@ -102,16 +99,16 @@ int main() { GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); } - // test mode - output_layer->SetLayerPhase(galois::GNNPhase::kTest); - galois::PointerWithSize pd3 = - output_layer->ForwardPhase(softmax_input); asdf = output_layer->BackwardPhase(softmax_input, nullptr); - printf("Output 3\n========\n"); + printf("Output 2\n========\n"); for (unsigned i = 0; i < asdf.size(); i++) { printf("%f\n", asdf[i]); } + // test mode + output_layer->SetLayerPhase(galois::GNNPhase::kTest); + galois::PointerWithSize pd3 = + output_layer->ForwardPhase(softmax_input); // validate vertex is index 6 
GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); // all but last are empty distributions @@ -124,4 +121,10 @@ int main() { GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); } + + asdf = output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 3\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } } From bc1b748ffb8ba82d1bea932ac21b4f5bcf8a644a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 17 Mar 2021 21:05:25 -0500 Subject: [PATCH 504/660] buffer overflow check Prevent send buffer from overflowing --- libgluon/include/galois/graphs/GluonSubstrate.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 2ef2e0b136..9e7a7738a4 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -2317,6 +2317,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { getSendBuffer(loopName, x, b, elem_size); + if (b.size() > static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("send buff limit limit reached: {}", b.size()); + } + if ((!async) || (b.size() > 0)) { size_t syncTypePhase = 0; if (async && (syncType == syncBroadcast)) From 81249b69370bbf4f11e3541e8614de6554e1fdb8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 15 Mar 2021 22:19:15 -0500 Subject: [PATCH 505/660] Remove host fence Unused from what I can tell. --- libdist/src/Barrier.cpp | 51 ----------------------------------------- 1 file changed, 51 deletions(-) diff --git a/libdist/src/Barrier.cpp b/libdist/src/Barrier.cpp index 455e22aaed..0558d8ebb4 100644 --- a/libdist/src/Barrier.cpp +++ b/libdist/src/Barrier.cpp @@ -41,52 +41,6 @@ #include "galois/runtime/BareMPI.h" namespace { -class HostFence : public galois::substrate::Barrier { -public: - virtual const char* name() const { return "HostFence"; } - - virtual void reinit(unsigned) {} - - //! control-flow barrier across distributed hosts - //! 
acts as a distributed-memory fence as well (flushes send and receives) - virtual void wait() { - auto& net = galois::runtime::getSystemNetworkInterface(); - - if (galois::runtime::evilPhase == 0) { - galois::gWarn("evilPhase is 0, implying loop-around or no use: fence " - "may not work correctly!"); - } - - for (unsigned h = 0; h < net.Num; ++h) { - if (h == net.ID) - continue; - galois::runtime::SendBuffer b; - galois::runtime::gSerialize(b, net.ID + 1); // non-zero message - net.sendTagged(h, galois::runtime::evilPhase, b); - } - net.flush(); // flush all sends - - unsigned received = 1; // self - while (received < net.Num) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - do { - net.handleReceives(); // flush all receives from net.sendMsg() or - // net.sendSimple() - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); - } while (!p); - assert(p->first != net.ID); - // ignore received data - ++received; - } - ++galois::runtime::evilPhase; - if (galois::runtime::evilPhase >= - static_cast( - std::numeric_limits::max())) { // limit defined by MPI or - // LCI - galois::runtime::evilPhase = 1; - } - } -}; class HostBarrier : public galois::substrate::Barrier { public: @@ -110,8 +64,3 @@ galois::substrate::Barrier& galois::runtime::getHostBarrier() { static HostBarrier b; return b; } - -galois::substrate::Barrier& galois::runtime::getHostFence() { - static HostFence b; - return b; -} From 39bec32762d7c58453330b4562a780d08fdb2c19 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 31 Mar 2021 19:40:08 -0500 Subject: [PATCH 506/660] Adam optimizer fix: epsilon outside sqrt The epsilon value for the Adam optimizer is meant to prevent division by 0: it should not appear in the sqrt computation as it can greatly affect the gradient in some cases since it is in the denominator. --- libgnn/src/GNNOptimizers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 664de35e01..843e75a1a6 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -41,7 +41,7 @@ void galois::AdamOptimizer::GradientDescent( // weight update using bias corrected moments (matrix.data())[i] -= config_.alpha * bias_correct_first / - std::sqrt(bias_correct_second + config_.epsilon); + (std::sqrt(bias_correct_second) + config_.epsilon); }, galois::loopname("AdamOptimizerGradientDescent")); #ifdef GALOIS_ENABLE_GPU From fa1d597e4906351137df1b00afe6a5d676cf3e18 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 14 Apr 2021 16:56:43 -0500 Subject: [PATCH 507/660] GNNGraph now has edge sample bit and reverse edges After graph partitioning, reverse edges get constructed on each partition: this is to do correct aggregation later when sampling edges. A byte has been added to all edges as well to mark if the edge is "on" or "off" (i.e., sampled), with the corresponding in-edge sharing this data as necessary. This is all in preparation for distributed edge sampling. 
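As a rough sketch of the intended use (assuming only the GNNGraph accessors added in this
patch; the "every even destination" policy below is an arbitrary stand-in for a real
sampling decision): a flag written through an out-edge is visible through its mirrored
in-edge, so an aggregation over in-edges can skip edges that were left off.

    // Sketch only, not part of the diff: write the sample byte via out-edges,
    // then read it back through the mirrored in-edges while aggregating.
    #include "galois/graphs/GNNGraph.h"

    void SampleThenAggregate(galois::graphs::GNNGraph& graph) {
      // phase 1: decide which out-edges are "on"
      for (size_t src = 0; src < graph.size(); ++src) {
        for (auto ei : graph.edges(src)) {
          if (graph.GetEdgeDest(ei) % 2 == 0) { // illustrative policy only
            graph.MakeEdgeSampled(ei);          // shared per-edge byte set to 1
          }
        }
      }
      // phase 2: aggregate over in-edges, skipping edges left "off"; the
      // in-edge sees the same byte as the out-edge it mirrors
      for (size_t dst = 0; dst < graph.size(); ++dst) {
        for (auto ei : graph.in_edges(dst)) {
          if (!graph.IsInEdgeSampled(ei)) {
            continue;
          }
          // ... accumulate the feature of graph.GetInEdgeDest(ei) into dst ...
        }
      }
    }
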
--- .../include/galois/graphs/DistributedGraph.h | 45 +++++++++- .../include/galois/graphs/BufferedGraph.h | 66 ++++++++------ .../include/galois/graphs/LC_CSR_CSC_Graph.h | 46 +++++----- .../include/galois/graphs/LC_CSR_Graph.h | 61 ++++++------- libgnn/include/galois/graphs/GNNGraph.h | 53 +++++++++-- libgnn/src/graphs/GNNGraph.cpp | 18 ++-- libgnn/src/layers/GraphConvolutionalLayer.cpp | 8 +- libgnn/src/layers/SAGELayer.cpp | 8 +- libgnn/test/CMakeLists.txt | 4 + libgnn/test/aggregate-sync-test.cpp | 14 +-- libgnn/test/sample-bit-test.cpp | 88 +++++++++++++++++++ 11 files changed, 299 insertions(+), 112 deletions(-) create mode 100644 libgnn/test/sample-bit-test.cpp diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 42b659fa67..1c56302b93 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -31,6 +31,7 @@ #include #include "galois/graphs/LC_CSR_Graph.h" +#include "galois/graphs/LC_CSR_CSC_Graph.h" #include "galois/graphs/BufferedGraph.h" #include "galois/runtime/DistStats.h" #include "galois/graphs/OfflineGraph.h" @@ -68,8 +69,8 @@ class DistGraph { constexpr static const char* const GRNAME = "dGraph"; using GraphTy = - galois::graphs::LC_CSR_Graph; + galois::graphs::LC_CSR_CSC_Graph; // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -745,6 +746,46 @@ class DistGraph { return IDs; } + ////////////////////////////////////////////////////////////////////////////// + // for in edges + ////////////////////////////////////////////////////////////////////////////// + + //! Construct the transpose graph for the partitioned graph + void ConstructIncomingEdges() { graph.constructIncomingEdges(); } + + /** + * Get the edge data for a particular edge in the graph. 
+ * + * @param ni edge to get the data of + * @param mflag access flag for edge data + * @returns The edge data for the requested edge + */ + typename GraphTy::edge_data_reference + GetInEdgeData(edge_iterator ni, + galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) { + return graph.getInEdgeData(ni, mflag); + } + + GraphNode GetInEdgeDest(edge_iterator ni) { return graph.getInEdgeDst(ni); } + + edge_iterator in_edge_begin(GraphNode N) { + return graph.in_edge_begin(N, galois::MethodFlag::UNPROTECTED); + } + + edge_iterator in_edge_end(GraphNode N) { + return graph.in_edge_end(N, galois::MethodFlag::UNPROTECTED); + } + + galois::runtime::iterable> + in_edges(GraphNode N) { + return galois::graphs::internal::make_no_deref_range(in_edge_begin(N), + in_edge_end(N)); + } + + ////////////////////////////////////////////////////////////////////////////// + // end in edges + ////////////////////////////////////////////////////////////////////////////// + protected: /** * Uses a pre-computed prefix sum to determine division of nodes among diff --git a/libgalois/include/galois/graphs/BufferedGraph.h b/libgalois/include/galois/graphs/BufferedGraph.h index e5e3fa4221..22cc10cc11 100644 --- a/libgalois/include/galois/graphs/BufferedGraph.h +++ b/libgalois/include/galois/graphs/BufferedGraph.h @@ -180,7 +180,7 @@ class BufferedGraph { typename std::enable_if::value>::type* = nullptr> void loadEdgeData(std::ifstream& graphFile, uint64_t edgeStart, uint64_t numEdgesToLoad, uint64_t numGlobalNodes, - uint64_t numGlobalEdges) { + uint64_t numGlobalEdges, uint64_t file_data_size) { if (numEdgesToLoad == 0) { return; } @@ -193,30 +193,39 @@ class BufferedGraph { GALOIS_DIE("Failed to allocate memory for edge data buffer."); } - // position after nodes + edges - uint64_t baseReadPosition = (4 + numGlobalNodes) * sizeof(uint64_t) + - (sizeof(uint32_t) * numGlobalEdges); - - // version 1 padding TODO make version agnostic - if (numGlobalEdges % 2) { - baseReadPosition += sizeof(uint32_t); - } - - // jump to first byte of edge data - uint64_t readPosition = - baseReadPosition + (sizeof(EdgeDataType) * edgeStart); - graphFile.seekg(readPosition); - uint64_t numBytesToLoad = numEdgesToLoad * sizeof(EdgeDataType); - uint64_t bytesRead = 0; - - while (numBytesToLoad > 0) { - graphFile.read(((char*)this->edgeDataBuffer) + bytesRead, numBytesToLoad); - size_t numRead = graphFile.gcount(); - numBytesToLoad -= numRead; - bytesRead += numRead; + if (file_data_size == sizeof(EdgeDataType)) { + // position after nodes + edges + uint64_t baseReadPosition = (4 + numGlobalNodes) * sizeof(uint64_t) + + (sizeof(uint32_t) * numGlobalEdges); + + // version 1 padding TODO make version agnostic + if (numGlobalEdges % 2) { + baseReadPosition += sizeof(uint32_t); + } + + // jump to first byte of edge data + uint64_t readPosition = + baseReadPosition + (sizeof(EdgeDataType) * edgeStart); + graphFile.seekg(readPosition); + uint64_t numBytesToLoad = numEdgesToLoad * sizeof(EdgeDataType); + uint64_t bytesRead = 0; + + while (numBytesToLoad > 0) { + graphFile.read(((char*)this->edgeDataBuffer) + bytesRead, + numBytesToLoad); + size_t numRead = graphFile.gcount(); + numBytesToLoad -= numRead; + bytesRead += numRead; + } + + assert(numBytesToLoad == 0); + } else { + // file on disk does not match edge data type: fill in the buffer + // with 0s instead + galois::gInfo("File on disk does not have appropriate edge data to read; " + "filling with 0s"); + memset(edgeDataBuffer, 0, sizeof(EdgeDataType) * numEdgesToLoad); } - - 
assert(numBytesToLoad == 0); } /** @@ -230,7 +239,8 @@ class BufferedGraph { template < typename EdgeType, typename std::enable_if::value>::type* = nullptr> - void loadEdgeData(std::ifstream&, uint64_t, uint64_t, uint64_t, uint64_t) { + void loadEdgeData(std::ifstream&, uint64_t, uint64_t, uint64_t, uint64_t, + uint64_t) { // do nothing (edge data is void, i.e. no edge data) } @@ -322,7 +332,7 @@ class BufferedGraph { loadEdgeDest(graphFile, 0, globalEdgeSize, globalSize); // may or may not do something depending on EdgeDataType loadEdgeData(graphFile, 0, globalEdgeSize, globalSize, - globalEdgeSize); + globalEdgeSize, header[1]); graphLoaded = true; graphFile.close(); @@ -350,6 +360,8 @@ class BufferedGraph { } std::ifstream graphFile(filename.c_str()); + uint64_t header[4]; + graphFile.read(((char*)header), sizeof(uint64_t) * 4); globalSize = numGlobalNodes; globalEdgeSize = numGlobalEdges; @@ -364,7 +376,7 @@ class BufferedGraph { // may or may not do something depending on EdgeDataType loadEdgeData(graphFile, edgeStart, numLocalEdges, - numGlobalNodes, numGlobalEdges); + numGlobalNodes, numGlobalEdges, header[1]); graphLoaded = true; graphFile.close(); diff --git a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h index 2da77fb6cb..9509f73a8e 100644 --- a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h @@ -50,31 +50,35 @@ namespace graphs { */ template + bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy, + typename NodeIndexTy = uint32_t, typename EdgeIndexTy = uint64_t> class LC_CSR_CSC_Graph : public LC_CSR_Graph { + HasOutOfLineLockable, FileEdgeTy, NodeIndexTy, + EdgeIndexTy> { // typedef to make it easier to read //! Typedef referring to base LC_CSR_Graph - using BaseGraph = LC_CSR_Graph; + using BaseGraph = + LC_CSR_Graph; //! Typedef referring to this class itself using ThisGraph = LC_CSR_CSC_Graph; + UseNumaAlloc, HasOutOfLineLockable, FileEdgeTy, + NodeIndexTy, EdgeIndexTy>; public: //! Graph node typedef - using GraphNode = uint32_t; + using GraphNode = NodeIndexTy; protected: - // retypedefs of base class + // redefinitions of base class typedefs //! large array for edge data using EdgeData = LargeArray; //! large array for edge destinations - using EdgeDst = LargeArray; + using EdgeDst = LargeArray; //! large array for edge index data - using EdgeIndData = LargeArray; + using EdgeIndData = LargeArray; public: //! iterator for edges @@ -85,7 +89,7 @@ class LC_CSR_CSC_Graph protected: //! edge index data for the reverse edges - EdgeIndData inEdgeIndData; + EdgeIndData in_edge_ind_data_; //! edge destination data for the reverse edges EdgeDst inEdgeDst; //! Edge data of inedges can be a value copy of the outedges (i.e. 
in and @@ -162,9 +166,9 @@ class LC_CSR_CSC_Graph } // copy over the new tranposed edge index data - inEdgeIndData.allocateInterleaved(BaseGraph::numNodes); + in_edge_ind_data_.allocateInterleaved(BaseGraph::numNodes); galois::do_all(galois::iterate(UINT64_C(0), BaseGraph::numNodes), - [&](uint64_t n) { inEdgeIndData[n] = dataBuffer[n]; }); + [&](uint64_t n) { in_edge_ind_data_[n] = dataBuffer[n]; }); } /** @@ -179,8 +183,9 @@ class LC_CSR_CSC_Graph // saving an edge for a node if (BaseGraph::numNodes >= 1) { dataBuffer[0] = 0; - galois::do_all(galois::iterate(UINT64_C(1), BaseGraph::numNodes), - [&](uint64_t n) { dataBuffer[n] = inEdgeIndData[n - 1]; }); + galois::do_all( + galois::iterate(UINT64_C(1), BaseGraph::numNodes), + [&](uint64_t n) { dataBuffer[n] = in_edge_ind_data_[n - 1]; }); } // allocate edge dests and data @@ -212,13 +217,6 @@ class LC_CSR_CSC_Graph } public: - //! default constructor - LC_CSR_CSC_Graph() = default; - //! default move constructor - LC_CSR_CSC_Graph(LC_CSR_CSC_Graph&& rhs) = default; - //! default = operator - LC_CSR_CSC_Graph& operator=(LC_CSR_CSC_Graph&&) = default; - ///////////////////////////////////////////////////////////////////////////// // Construction functions ///////////////////////////////////////////////////////////////////////////// @@ -254,7 +252,7 @@ class LC_CSR_CSC_Graph * @returns Iterator to first in edge of node N */ edge_iterator in_raw_begin(GraphNode N) const { - return edge_iterator((N == 0) ? 0 : inEdgeIndData[N - 1]); + return edge_iterator((N == 0) ? 0 : in_edge_ind_data_[N - 1]); } /** @@ -265,7 +263,7 @@ class LC_CSR_CSC_Graph * node N+1) */ edge_iterator in_raw_end(GraphNode N) const { - return edge_iterator(inEdgeIndData[N]); + return edge_iterator(in_edge_ind_data_[N]); } /** @@ -389,7 +387,7 @@ class LC_CSR_CSC_Graph /** * @returns the prefix sum of in-edges */ - const EdgeIndData& getInEdgePrefixSum() const { return inEdgeIndData; } + const EdgeIndData& getInEdgePrefixSum() const { return in_edge_ind_data_; } ///////////////////////////////////////////////////////////////////////////// // Utility diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index afd0b52abd..9f849d0efc 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -344,8 +344,9 @@ class LC_CSR_Graph : uint64_t operator[](uint64_t n) { return *(edge_end(n)); } template - LC_CSR_Graph(NodeIndexTy _numNodes, EdgeIndexTy _numEdges, EdgeNumFnTy edgeNum, - EdgeDstFnTy _edgeDst, EdgeDataFnTy _edgeData) + LC_CSR_Graph(NodeIndexTy _numNodes, EdgeIndexTy _numEdges, + EdgeNumFnTy edgeNum, EdgeDstFnTy _edgeDst, + EdgeDataFnTy _edgeData) : numNodes(_numNodes), numEdges(_numEdges) { if (UseNumaAlloc) { //! 
[numaallocex] @@ -717,8 +718,8 @@ class LC_CSR_Graph : } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, - EdgeIndexTy e, + void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, + EdgeIndexTy e_new, EdgeIndexTy e, typename std::enable_if::type* = 0) { edgeData_new[e_new] = edgeData[e]; } @@ -815,7 +816,7 @@ class LC_CSR_Graph : std::vector& prefix_sum, std::vector>& edges_id, std::vector>& edges_data) { - //allocateFrom(numNodes, numEdges); + // allocateFrom(numNodes, numEdges); /* * Deallocate if reusing the graph */ @@ -823,24 +824,25 @@ class LC_CSR_Graph : constructNodes(); galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), - [&](NodeIndexTy n) { edgeIndData[n] = prefix_sum[n]; }); + [&](NodeIndexTy n) { edgeIndData[n] = prefix_sum[n]; }); galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), - [&](NodeIndexTy n) { - if (n == 0) { - if (edgeIndData[n] > 0) { - std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin()); - std::copy(edges_data[n].begin(), edges_data[n].end(), - edgeData.begin()); - } - } else { - if (edgeIndData[n] - edgeIndData[n - 1] > 0) { - std::copy(edges_id[n].begin(), edges_id[n].end(), - edgeDst.begin() + edgeIndData[n - 1]); - std::copy(edges_data[n].begin(), edges_data[n].end(), - edgeData.begin() + edgeIndData[n - 1]); - } - } - }); + [&](NodeIndexTy n) { + if (n == 0) { + if (edgeIndData[n] > 0) { + std::copy(edges_id[n].begin(), edges_id[n].end(), + edgeDst.begin()); + std::copy(edges_data[n].begin(), edges_data[n].end(), + edgeData.begin()); + } + } else { + if (edgeIndData[n] - edgeIndData[n - 1] > 0) { + std::copy(edges_id[n].begin(), edges_id[n].end(), + edgeDst.begin() + edgeIndData[n - 1]); + std::copy(edges_data[n].begin(), edges_data[n].end(), + edgeData.begin() + edgeIndData[n - 1]); + } + } + }); initializeLocalRanges(); } @@ -874,10 +876,10 @@ class LC_CSR_Graph : initializeLocalRanges(); } -//////////////////////////////////////////////////////////////////////////////// -// Warning: the below code is NOT compatible with NodeIndexTy/EdgeIndexTy; -// do NOT use with them -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + // Warning: the below code is NOT compatible with NodeIndexTy/EdgeIndexTy; + // do NOT use with them + //////////////////////////////////////////////////////////////////////////////// /** * Reads the GR files directly into in-memory @@ -1030,10 +1032,9 @@ class LC_CSR_Graph : this->setLocalRange(*r.first, *r.second); }); } -//////////////////////////////////////////////////////////////////////////////// -// End warning section -//////////////////////////////////////////////////////////////////////////////// - + //////////////////////////////////////////////////////////////////////////////// + // End warning section + //////////////////////////////////////////////////////////////////////////////// }; } // namespace galois::graphs diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 071b33aeac..3b5a499a57 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -33,7 +33,7 @@ enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; //! 
XXX class GNNGraph { public: - using GNNDistGraph = galois::graphs::DistGraph; + using GNNDistGraph = galois::graphs::DistGraph; using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned @@ -86,20 +86,61 @@ class GNNGraph { return partitioned_graph_->masterNodesRange().end(); } + ////////////////////////////////////////////////////////////////////////////// + // out edges + ////////////////////////////////////////////////////////////////////////////// // All following functions take a local node id - EdgeIterator EdgeBegin(GraphNode n) const { + EdgeIterator edge_begin(GraphNode n) const { return partitioned_graph_->edge_begin(n); }; - EdgeIterator EdgeEnd(GraphNode n) const { + EdgeIterator edge_end(GraphNode n) const { return partitioned_graph_->edge_end(n); }; - GraphNode EdgeDestination(EdgeIterator ei) const { + GraphNode GetEdgeDest(EdgeIterator ei) const { return partitioned_graph_->getEdgeDst(ei); }; - GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + char IsEdgeSampled(EdgeIterator ei) const { + return partitioned_graph_->getEdgeData(ei); + }; + //! Set the flag on the edge to 1; makes it sampled + void MakeEdgeSampled(EdgeIterator ei) { + partitioned_graph_->getEdgeData(ei) = 1; + }; + //! Set the flag on the edge to 0; makes it not sampled + void MakeEdgeUnsampled(EdgeIterator ei) { + partitioned_graph_->getEdgeData(ei) = 0; + }; + galois::runtime::iterable< + galois::NoDerefIterator> + edges(GraphNode N) { + return partitioned_graph_->edges(N); + } + ////////////////////////////////////////////////////////////////////////////// + // in edges + ////////////////////////////////////////////////////////////////////////////// + EdgeIterator in_edge_begin(GraphNode n) const { + return partitioned_graph_->in_edge_begin(n); + } + EdgeIterator in_edge_end(GraphNode n) const { + return partitioned_graph_->in_edge_end(n); + } + GraphNode GetInEdgeDest(EdgeIterator ei) const { + return partitioned_graph_->GetInEdgeDest(ei); + }; + char IsInEdgeSampled(EdgeIterator ei) const { + return partitioned_graph_->GetInEdgeData(ei); + }; + galois::runtime::iterable< + galois::NoDerefIterator> + in_edges(GraphNode N) { + return partitioned_graph_->in_edges(N); + } + ////////////////////////////////////////////////////////////////////////////// + + GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! 
inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat DegreeNorm(GraphNode n) const { return degree_norm_[n]; } + GNNFloat GetDegreeNorm(GraphNode n) const { return degree_norm_[n]; } // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 89cdca94e9..0c10f7a023 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -18,13 +18,13 @@ LoadPartition(const std::string& input_directory, // load partition switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); @@ -68,6 +68,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // load partition partitioned_graph_ = LoadPartition(input_directory_, dataset_name, partition_scheme); + // reverse edges + partitioned_graph_->ConstructIncomingEdges(); // read additional graph data ReadLocalLabels(dataset_name, has_single_class_label); @@ -253,8 +255,8 @@ void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, for (size_t current_depth = 0; current_depth < walk_depth; current_depth++) { // pick random edge, mark sampled, swap roots - EdgeIterator first_edge = EdgeBegin(root); - size_t num_edges = std::distance(first_edge, EdgeEnd(root)); + EdgeIterator first_edge = edge_begin(root); + size_t num_edges = std::distance(first_edge, edge_end(root)); if (num_edges == 0) { break; } @@ -267,7 +269,7 @@ void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, long int rand_num; lrand48_r(&seed_struct, &rand_num); EdgeIterator selected_edge = first_edge + (rand_num % num_edges); - size_t candidate_dest = EdgeDestination(selected_edge); + size_t candidate_dest = GetEdgeDest(selected_edge); // TODO(loc) another possibility is to just pick it anyways regardless // but don't mark it as sampled, though this would lead to disconnected @@ -609,9 +611,9 @@ void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, // TODO(loc) make this work in a distributed setting; assuming // whole graph is present on single host at the moment - for (EdgeIterator e = EdgeBegin(local_id); e != EdgeEnd(local_id); + for (EdgeIterator e = edge_begin(local_id); e != edge_end(local_id); e++) { - size_t dest = EdgeDestination(e); + size_t dest = GetEdgeDest(e); if (is_sampled && is_inductive) { if (!IsValidForPhase(dest, GNNPhase::kTrain) || !IsInSampledGraph(dest)) { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 6f86cf1395..282042a805 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -317,7 +317,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.NormFactor(src); + source_norm = graph_.GetNormFactor(src); } // init 
to self @@ -334,8 +334,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { - size_t dst = graph_.EdgeDestination(e); + for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { + size_t dst = graph_.GetEdgeDest(e); graphs::bitset_graph_aggregate.set(src); if (layer_phase_ == GNNPhase::kTrain) { @@ -356,7 +356,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); + GNNFloat norm_scale = source_norm * graph_.GetNormFactor(dst); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8fde856ac8..bd6b84469f 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -412,13 +412,13 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.DegreeNorm(src); + source_norm = graph_.GetDegreeNorm(src); } // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { graphs::bitset_graph_aggregate.set(src); - size_t dst = graph_.EdgeDestination(e); + size_t dst = graph_.GetEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { if (IsInductiveLayer()) { @@ -442,7 +442,7 @@ void galois::SAGELayer::AggregateAllCPU( if (!is_backward) { norm_scale = source_norm; } else { - norm_scale = graph_.DegreeNorm(dst); + norm_scale = graph_.GetDegreeNorm(dst); } galois::VectorMulAdd( diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index b9ef634c53..91835cfc07 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -81,6 +81,10 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(sample-test sample-test.cpp) target_link_libraries(sample-test galois_gnn) add_test(NAME sample-test COMMAND sample-test) + + add_executable(sample-bit-test sample-bit-test.cpp) + target_link_libraries(sample-bit-test galois_gnn) + add_test(NAME sample-bit-test COMMAND sample-bit-test) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index eac1f89c84..d95931a798 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -14,11 +14,11 @@ int main() { // print edges for sanity for (size_t node = 0; node < test_graph->size(); node++) { - for (auto e = test_graph->EdgeBegin(node); e != test_graph->EdgeEnd(node); + for (auto e = test_graph->edge_begin(node); e != test_graph->edge_end(node); e++) { galois::gPrint(test_graph->host_prefix(), "Edge ", test_graph->GetGID(node), " ", - test_graph->GetGID(test_graph->EdgeDestination(e)), "\n"); + test_graph->GetGID(test_graph->GetEdgeDest(e)), "\n"); } } for (auto own = test_graph->begin_owned(); own != test_graph->end_owned(); @@ -210,11 +210,11 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kCVC, true); // print edges for sanity for (size_t node = 0; node < test_graph_2->size(); node++) { - for (auto e = test_graph_2->EdgeBegin(node); - e != 
test_graph_2->EdgeEnd(node); e++) { - galois::gPrint( - test_graph_2->host_prefix(), "Edge ", test_graph_2->GetGID(node), " ", - test_graph_2->GetGID(test_graph_2->EdgeDestination(e)), "\n"); + for (auto e = test_graph_2->edge_begin(node); + e != test_graph_2->edge_end(node); e++) { + galois::gPrint(test_graph_2->host_prefix(), "Edge ", + test_graph_2->GetGID(node), " ", + test_graph_2->GetGID(test_graph_2->GetEdgeDest(e)), "\n"); } } for (auto own = test_graph_2->begin_owned(); own != test_graph_2->end_owned(); diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp new file mode 100644 index 0000000000..cc1226e8bf --- /dev/null +++ b/libgnn/test/sample-bit-test.cpp @@ -0,0 +1,88 @@ +//! @file sample-bit-test.cpp +//! Checks to see if edge sample bit is set correctly. + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + galois::graphs::GNNGraph graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // first, assert all edges are not sampled (should come with all 0s) + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + } + for (auto ei : graph.in_edges(node)) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + } + } + + // make all edges sampled; it should set the in-edges as well + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeSampled(ei); + } + } + + // all edges (including ins) should be sampled + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + } + for (auto ei : graph.in_edges(node)) { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + } + } + + // clear sample bits for odd numbers + for (size_t node = 0; node < graph.size(); node++) { + if (node % 2 == 1) { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeUnsampled(ei); + } + } + } + + // do another check + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + if (node % 2 == 1) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + } else { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + } + } + + // in edges for this node: if destination (i.e., source) is + // odd, then it should not be sampled + for (auto ei : graph.in_edges(node)) { + if ((graph.GetInEdgeDest(ei) % 2) == 1) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + } else { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + } + } + } + + // print edges for a quick lookover if run manually + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + galois::gPrint("Out edge ", node, " ", graph.GetEdgeDest(ei), "\n"); + } + for (auto ei : graph.in_edges(node)) { + galois::gPrint("In edge to ", node, " from ", graph.GetInEdgeDest(ei), + "\n"); + } + } + + return 0; +} From 386fbb83682be6232da9c980a459ad6fac0111f1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 15 Apr 2021 14:14:57 -0500 Subject: [PATCH 508/660] CSR_CSC always creates in-to-out mapping Before, in to out mapping in CSR/CSC only created if edge data exists. Now it is always created in case user wants to create edge-data outside of the graph object (e.g., in GNNs). 
Adds the function to get access to this mapping as well. --- .../include/galois/graphs/DistributedGraph.h | 5 ++++ .../include/galois/graphs/LC_CSR_CSC_Graph.h | 27 ++++++++++--------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 1c56302b93..e13f71e4d2 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -782,6 +782,11 @@ class DistGraph { in_edge_end(N)); } + //! Return corresponding out-edge index for an in-edge + size_t InEdgeToOutEdge(edge_iterator ni) const { + return graph.InEdgeToOutEdge(ni); + } + ////////////////////////////////////////////////////////////////////////////// // end in edges ////////////////////////////////////////////////////////////////////////////// diff --git a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h index 9509f73a8e..2f0b9e88de 100644 --- a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h @@ -99,6 +99,7 @@ class LC_CSR_CSC_Graph typename std::conditional::type; //! The data for the reverse edges EdgeDataRep inEdgeData; + EdgeIndData in_edge_to_out_edge_; //! redefinition of the edge sort iterator in LC_CSR_Graph using edge_sort_iterator = @@ -127,19 +128,11 @@ class LC_CSR_CSC_Graph BaseGraph::edgeDataCopy(inEdgeData, BaseGraph::edgeData, e_new, e); } - /** - * Save a pointer to an outedge (i.e. map an in-edge to an out-edge). Done - * to share edge data. - * - * @param e_new position of out-edge to save - * @param e position of in-edge - */ + //! Do nothing; getting edge data will be done via pointer template ::type* = nullptr> - void createEdgeData(const uint64_t e_new, const uint64_t e) { - if (!std::is_void::value) { - inEdgeData[e_new] = e; - } + void createEdgeData(const uint64_t, const uint64_t) { + // do nothing } /** @@ -194,6 +187,7 @@ class LC_CSR_CSC_Graph if (!std::is_void::value) { inEdgeData.allocateInterleaved(BaseGraph::numEdges); } + in_edge_to_out_edge_.allocateInterleaved(BaseGraph::numEdges); galois::do_all( galois::iterate(UINT64_C(0), BaseGraph::numNodes), [&](uint64_t src) { @@ -211,6 +205,7 @@ class LC_CSR_CSC_Graph inEdgeDst[e_new] = src; // edge data to "new" array createEdgeData(e_new, e); + in_edge_to_out_edge_[e_new] = e; e++; } }); @@ -365,7 +360,7 @@ class LC_CSR_CSC_Graph typename std::enable_if::type* = nullptr> edge_data_reference getInEdgeData(edge_iterator ni, MethodFlag = MethodFlag::UNPROTECTED) const { - return BaseGraph::edgeData[inEdgeData[*ni]]; + return BaseGraph::edgeData[in_edge_to_out_edge_[*ni]]; } /** @@ -381,7 +376,13 @@ class LC_CSR_CSC_Graph typename std::enable_if::type* = nullptr> edge_data_reference getInEdgeData(edge_iterator ni, MethodFlag = MethodFlag::UNPROTECTED) { - return BaseGraph::edgeData[inEdgeData[*ni]]; + return BaseGraph::edgeData[in_edge_to_out_edge_[*ni]]; + } + + //! Returns corresponding index for the out-edge corresponding to + //! an in-edge. + size_t InEdgeToOutEdge(edge_iterator ni) const { + return in_edge_to_out_edge_[*ni]; } /** From fbfe895ed217a4cbf894681a82f7c2fe549c5438 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 15 Apr 2021 14:23:39 -0500 Subject: [PATCH 509/660] Separate edge sample data; data also vector now Edge sampling data separated from partitioned graph. Also made into a vector because each layer will have different sampling status. 
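A rough sketch of the per-layer behavior (assuming the accessors changed in this patch;
it mirrors what sample-bit-test.cpp checks): setting an edge's flag for one layer leaves
the flags of the other layers untouched.

    // Sketch only, not part of the diff: per-layer flags on one edge are independent.
    #include "galois/Logging.h"
    #include "galois/graphs/GNNGraph.h"

    int main() {
      galois::DistMemSys G; // runtime setup, as in the unit tests
      galois::graphs::GNNGraph graph(
          "tester", galois::graphs::GNNPartitionScheme::kOEC, true);
      graph.InitializeEdgeData(2); // one sample flag per edge per layer
      for (size_t node = 0; node < graph.size(); ++node) {
        for (auto ei : graph.edges(node)) {
          graph.MakeEdgeSampled(ei, 0);                   // layer 0 turns it on
          GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0));
          GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); // layer 1 unaffected
        }
      }
      return 0;
    }
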
--- libgnn/include/galois/graphs/GNNGraph.h | 36 ++++++++++++++++++------- libgnn/src/graphs/GNNGraph.cpp | 6 ++--- libgnn/test/sample-bit-test.cpp | 21 ++++++++------- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3b5a499a57..72dce17185 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -33,7 +33,7 @@ enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; //! XXX class GNNGraph { public: - using GNNDistGraph = galois::graphs::DistGraph; + using GNNDistGraph = galois::graphs::DistGraph; using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned @@ -87,8 +87,19 @@ class GNNGraph { } ////////////////////////////////////////////////////////////////////////////// - // out edges + // Edges ////////////////////////////////////////////////////////////////////////////// + + void InitializeEdgeData() { InitializeEdgeData(1); } + + void InitializeEdgeData(size_t num_layers) { + edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); + } + + ////////////////////////////////////////////////////////////////////////////// + // Out Edges + ////////////////////////////////////////////////////////////////////////////// + // All following functions take a local node id EdgeIterator edge_begin(GraphNode n) const { return partitioned_graph_->edge_begin(n); @@ -99,22 +110,23 @@ class GNNGraph { GraphNode GetEdgeDest(EdgeIterator ei) const { return partitioned_graph_->getEdgeDst(ei); }; - char IsEdgeSampled(EdgeIterator ei) const { - return partitioned_graph_->getEdgeData(ei); + bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { + return edge_sample_status_[*ei][layer_num]; }; //! Set the flag on the edge to 1; makes it sampled - void MakeEdgeSampled(EdgeIterator ei) { - partitioned_graph_->getEdgeData(ei) = 1; + void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[*ei][layer_num] = 1; }; //! Set the flag on the edge to 0; makes it not sampled - void MakeEdgeUnsampled(EdgeIterator ei) { - partitioned_graph_->getEdgeData(ei) = 0; + void MakeEdgeUnsampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[*ei][layer_num] = 0; }; galois::runtime::iterable< galois::NoDerefIterator> edges(GraphNode N) { return partitioned_graph_->edges(N); } + ////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// @@ -127,8 +139,9 @@ class GNNGraph { GraphNode GetInEdgeDest(EdgeIterator ei) const { return partitioned_graph_->GetInEdgeDest(ei); }; - char IsInEdgeSampled(EdgeIterator ei) const { - return partitioned_graph_->GetInEdgeData(ei); + bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { + return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] + [layer_num]; }; galois::runtime::iterable< galois::NoDerefIterator> @@ -332,6 +345,9 @@ class GNNGraph { std::vector local_ground_truth_labels_; //! Feature vectors for nodes in partitioned graph std::vector local_node_features_; + //! Sample data on edges: each edge gets a small bitset to mark + //! if it's been sampled for a particular layer + galois::LargeArray> edge_sample_status_; // TODO maybe revisit this and use an actual bitset //! 
Bitset indicating which nodes are training nodes diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0c10f7a023..073c616127 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -18,13 +18,13 @@ LoadPartition(const std::string& input_directory, // load partition switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index cc1226e8bf..1ad2d50196 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -15,31 +15,32 @@ int main() { galois::graphs::GNNGraph graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + graph.InitializeEdgeData(3); // first, assert all edges are not sampled (should come with all 0s) for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { - GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 0)); } for (auto ei : graph.in_edges(node)) { - GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 0)); } } // make all edges sampled; it should set the in-edges as well for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { - graph.MakeEdgeSampled(ei); + graph.MakeEdgeSampled(ei, 0); } } // all edges (including ins) should be sampled for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { - GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); } for (auto ei : graph.in_edges(node)) { - GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); } } @@ -47,7 +48,7 @@ int main() { for (size_t node = 0; node < graph.size(); node++) { if (node % 2 == 1) { for (auto ei : graph.edges(node)) { - graph.MakeEdgeUnsampled(ei); + graph.MakeEdgeUnsampled(ei, 0); } } } @@ -56,9 +57,9 @@ int main() { for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { if (node % 2 == 1) { - GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 0)); } else { - GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); } } @@ -66,9 +67,9 @@ int main() { // odd, then it should not be sampled for (auto ei : graph.in_edges(node)) { if ((graph.GetInEdgeDest(ei) % 2) == 1) { - GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 0)); } else { - GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); } } } From 7891816ad0516f4e399c0f97d3ce81cb534fe3cd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 15 Apr 2021 15:39:38 -0500 Subject: [PATCH 510/660] GNN set sample edge via in-edge; more tests Functions for setting an edge bit via in-edge + tests for it along with tests for the 
different layers. --- libgnn/include/galois/graphs/GNNGraph.h | 8 +++ libgnn/test/sample-bit-test.cpp | 76 +++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 72dce17185..d5b5ee05dd 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -143,6 +143,14 @@ class GNNGraph { return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] [layer_num]; }; + //! Set the flag on the edge to 1; makes it sampled + void MakeInEdgeSampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 1; + }; + //! Set the flag on the edge to 0; makes it not sampled + void MakeInEdgeUnsampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 0; + }; galois::runtime::iterable< galois::NoDerefIterator> in_edges(GraphNode N) { diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index 1ad2d50196..66d739a6d7 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -61,6 +61,8 @@ int main() { } else { GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); } + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 2)); } // in edges for this node: if destination (i.e., source) is @@ -71,6 +73,80 @@ int main() { } else { GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); } + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 2)); + } + } + + // odd layer 1, even layer 2 + for (size_t node = 0; node < graph.size(); node++) { + if (node % 2 == 1) { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeSampled(ei, 1); + } + } else { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeSampled(ei, 2); + } + } + } + + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + if (node % 2 == 1) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 2)); + } else { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 2)); + } + } + + // in edges for this node: if destination (i.e., source) is + // odd, then it should not be sampled + for (auto ei : graph.in_edges(node)) { + if ((graph.GetInEdgeDest(ei) % 2) == 1) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 2)); + } else { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 2)); + } + } + } + + // odd layer 1, even layer 2; set in edge + for (size_t node = 0; node < graph.size(); node++) { + if (node % 2 == 1) { + for (auto ei : graph.in_edges(node)) { + graph.MakeInEdgeUnsampled(ei, 1); + } + } else { + for (auto ei : graph.in_edges(node)) { + graph.MakeInEdgeSampled(ei, 1); + } + } + } + + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.in_edges(node)) { + if (node % 2 == 1) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 1)); + } else { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 1)); + } + } + + for (auto ei : graph.edges(node)) { + if ((graph.GetEdgeDest(ei) % 2) == 1) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); + } 
else { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 1)); + } } } From 4065206d38bf3181a176a6466884520b748100e6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Apr 2021 18:04:30 -0500 Subject: [PATCH 511/660] Basic edge sampling for distributed SAGE 1) Edges sampled probabilistically across hosts, but out-degree used when in-degree is supposed to be used. 2) GCN not supported yet. 3) Norm factors are wrong (requires another comm step). 4) Bitset has to be completely set because otherwise it segfaults for some reason. 5) Memory usage not reduced: still iterating over full graph. Next step is to actually create the subgraph. --- libgnn/include/galois/graphs/GNNGraph.h | 19 +++ .../graphs/GraphAggregationSyncStructures.h | 40 ++++++ libgnn/src/GraphNeuralNetwork.cpp | 23 +++- libgnn/src/graphs/GNNGraph.cpp | 70 ++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 + libgnn/src/layers/SAGELayer.cpp | 123 +++++++++++++----- 6 files changed, 240 insertions(+), 38 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index d5b5ee05dd..853a96dc0d 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -156,6 +156,16 @@ class GNNGraph { in_edges(GraphNode N) { return partitioned_graph_->in_edges(N); } + + ////////////////////////////////////////////////////////////////////////////// + // neighborhood sampling + ////////////////////////////////////////////////////////////////////////////// + + //! Set seed nodes, i.e., nodes that are being predicted on + void SetupNeighborhoodSample(); + //! Sample neighbors of nodes that are marked as ready for sampling + void SampleEdges(size_t sample_layer_num, size_t num_to_sample); + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } @@ -247,10 +257,12 @@ class GNNGraph { //! graph bool IsInSampledGraph(const NodeIterator& ni) const { // TODO(loc) GPU + assert(*ni < size()); return partitioned_graph_->getData(*ni); } bool IsInSampledGraph(size_t node_id) const { // TODO(loc) GPU + assert(node_id < size()); return partitioned_graph_->getData(node_id); } @@ -353,10 +365,17 @@ class GNNGraph { std::vector local_ground_truth_labels_; //! Feature vectors for nodes in partitioned graph std::vector local_node_features_; + + ////////////////////////////////////////////////////////////////////////////// + //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; + galois::DynamicBitSet new_sampled_nodes_; + + ////////////////////////////////////////////////////////////////////////////// + // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes std::vector local_training_mask_; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 8e3db38096..7759c26dca 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -17,6 +17,46 @@ extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; #endif +struct SampleFlagSync { + using ValTy = char; + + //! return a vector of floats to sync + static ValTy extract(uint32_t, char& i) { return i; } + + //! reduction is addition in this case; add received vector to + //! 
own vector + static bool reduce(uint32_t, char& i, ValTy y) { + if (y > i) { + i = y; + assert(i == 1); + return true; + } else { + return false; + } + } + + //! No-op: readAny = overwritten anyways + static void reset(uint32_t, char&) {} + + //! element wise set + static void setVal(uint32_t, char& i, ValTy y) { i = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + struct GNNSumAggregate { using ValTy = galois::gstl::Vector; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 4942076b23..42bcfc3b08 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -99,6 +99,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; } } + // output layer not included; it will never involve sampling + graph_->InitializeEdgeData(gnn_layers_.size()); // create the output layer GNNLayerDimensions output_dims = { @@ -158,6 +160,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); } + galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); @@ -165,11 +168,25 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); + if (config_.do_sampling()) { - // subgraph sample every epoch - graph_->GraphSAINTSample(); - graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); + graph_->SetupNeighborhoodSample(); + size_t num_sampled_layers = 0; + + // work backwards on GCN/SAGE layers + // loop backward and find last GCN/SAGE (main) layer to disable activation + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + graph_->SampleEdges((*back_iter)->layer_number(), 5); + num_sampled_layers++; + } + } + galois::gDebug("Number of sampled layers is ", num_sampled_layers); } + const PointerWithSize predictions = DoInference(); // have to get accuracy here because gradient prop destroys the predictions // matrix diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 073c616127..c12288c950 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -819,6 +819,76 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( return global_f1_micro_score; } +void galois::graphs::GNNGraph::SetupNeighborhoodSample() { + new_sampled_nodes_.resize(size()); + new_sampled_nodes_.reset(); + + // for now, if training node, it goes into seed node + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsValidForPhase(*x, GNNPhase::kTrain)) { + 
SetSampledNode(*x); + } else { + UnsetSampledNode(*x); + } + }); + // clear all sampled edges + galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->sizeEdges()), + [&](size_t edge_id) { + std::fill(edge_sample_status_[edge_id].begin(), + edge_sample_status_[edge_id].end(), 0); + }); +} + +void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, + size_t num_to_sample) { + galois::GAccumulator sampled; + galois::GAccumulator total; + sampled.reset(); + total.reset(); + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + // only operate on if sampled + if (partitioned_graph_->getData(*x)) { + // chance of not uniformly choosing an edge of this node num_to_sample + // times (degree norm is 1 / degree) + // XXX in-degree prob, not out degree + double probability_of_reject = + std::pow(1 - GetDegreeNorm(*x), num_to_sample); + // loop through in-edges, turn "on" edge with some probability + for (auto edge_iter : partitioned_graph_->in_edges(*x)) { + if (sample_rng_.DoBernoulli(probability_of_reject)) { + // if here, it means edge accepted; set sampled on, mark source + // as part of next set + MakeInEdgeSampled(edge_iter, sample_layer_num); + new_sampled_nodes_.set( + partitioned_graph_->GetInEdgeDest(edge_iter)); + sampled += 1; + } + total += 1; + } + } + }, + galois::steal(), galois::loopname("NeighborhoodSample")); + + galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", + total.reduce(), "\n"); + + std::vector new_nodes = new_sampled_nodes_.getOffsets(); + + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(new_nodes), + [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::loopname("NeighborhoodSampleSet")); + + // XXX(loc) bitset; can readAny be weaker? 
+ sync_substrate_->sync("SampleSync"); +} + +//////////////////////////////////////////////////////////////////////////////// + #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { // create int casted CSR diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 282042a805..b5a538d314 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -308,6 +308,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } if (IsSampledLayer()) { + // XXX(loc) + GALOIS_LOG_WARN( + "Edge sampling not yet implemented for GCN; only SAGE"); // check if node is part of sampled graph; ignore after 0'ing if not // sampled if (!graph_.IsInSampledGraph(src)) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index bd6b84469f..8a01c8fe1d 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -384,11 +384,16 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*, bool is_backward) { + size_t num_nodes = graph_.size(); galois::do_all( galois::iterate(static_cast(0), num_nodes), [&](size_t src) { + // TODO(loc) this is currently a hack: the sync substrate blows + // up if not the entire bitset is set for sync call like in + // edge sampling + graphs::bitset_graph_aggregate.set(src); size_t index_to_src_feature = src * column_length; // zero out src feature first for (size_t i = 0; i < column_length; i++) { @@ -403,10 +408,10 @@ void galois::SAGELayer::AggregateAllCPU( } if (IsSampledLayer()) { - // check if node is part of sampled graph; ignore after 0'ing if not - // sampled - if (!graph_.IsInSampledGraph(src)) + // check if node is part of sampled graph + if (!graph_.IsInSampledGraph(src)) { return; + } } } @@ -415,46 +420,94 @@ void galois::SAGELayer::AggregateAllCPU( source_norm = graph_.GetDegreeNorm(src); } - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - graphs::bitset_graph_aggregate.set(src); - size_t dst = graph_.GetEdgeDest(e); - - if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; + if (!is_backward) { + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); + e++) { + // graphs::bitset_graph_aggregate.set(src); + size_t dst = graph_.GetEdgeDest(e); + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + if (!graph_.IsEdgeSampled(e, layer_number_)) { + continue; + } + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } } - if (IsSampledLayer()) { - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; + size_t index_to_dst_feature = dst * column_length; + + if (!config_.disable_normalization) { + GNNFloat norm_scale; + if (!is_backward) { + norm_scale = source_norm; + } else { + norm_scale = graph_.GetDegreeNorm(dst); + } + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + 
&aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); } } + } else { + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); + e++) { + // graphs::bitset_graph_aggregate.set(src); + size_t dst = graph_.GetInEdgeDest(e); + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + if (!graph_.IsInEdgeSampled(e, layer_number_)) { + continue; + } + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } + } - size_t index_to_dst_feature = dst * column_length; + size_t index_to_dst_feature = dst * column_length; - if (!config_.disable_normalization) { - GNNFloat norm_scale; - if (!is_backward) { - norm_scale = source_norm; + if (!config_.disable_normalization) { + GNNFloat norm_scale = graph_.GetDegreeNorm(dst); + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); } else { - norm_scale = graph_.GetDegreeNorm(dst); + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); } - - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); } } }, From 51cab969c403f89671dbc4522cba6f0e7c62f056 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 23 Apr 2021 18:42:12 -0500 Subject: [PATCH 512/660] CSR/CSC Graph: manual construction functions Allows a user to manually construct the the in-edges rather than calling into a function to do it. --- .../include/galois/graphs/LC_CSR_CSC_Graph.h | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h index 2f0b9e88de..09224296a3 100644 --- a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h @@ -91,7 +91,7 @@ class LC_CSR_CSC_Graph //! edge index data for the reverse edges EdgeIndData in_edge_ind_data_; //! edge destination data for the reverse edges - EdgeDst inEdgeDst; + EdgeDst in_edge_dst_; //! Edge data of inedges can be a value copy of the outedges (i.e. in and //! out edges have separate edge values) or inedges can refer to the same //! data as its corresponding outedge; this is what this typedef is for @@ -108,12 +108,12 @@ class LC_CSR_CSC_Graph //! beginning iterator to an edge sorter for in-edges edge_sort_iterator in_edge_sort_begin(GraphNode N) { - return edge_sort_iterator(*in_raw_begin(N), &inEdgeDst, &inEdgeData); + return edge_sort_iterator(*in_raw_begin(N), &in_edge_dst_, &inEdgeData); } //! 
ending iterator to an edge sorter for in-edges edge_sort_iterator in_edge_sort_end(GraphNode N) { - return edge_sort_iterator(*in_raw_end(N), &inEdgeDst, &inEdgeData); + return edge_sort_iterator(*in_raw_end(N), &in_edge_dst_, &inEdgeData); } /** @@ -182,7 +182,7 @@ class LC_CSR_CSC_Graph } // allocate edge dests and data - inEdgeDst.allocateInterleaved(BaseGraph::numEdges); + in_edge_dst_.allocateInterleaved(BaseGraph::numEdges); if (!std::is_void::value) { inEdgeData.allocateInterleaved(BaseGraph::numEdges); @@ -202,7 +202,7 @@ class LC_CSR_CSC_Graph // location to save edge auto e_new = __sync_fetch_and_add(&(dataBuffer[dst]), 1); // save src as destination - inEdgeDst[e_new] = src; + in_edge_dst_[e_new] = src; // edge data to "new" array createEdgeData(e_new, e); in_edge_to_out_edge_[e_new] = e; @@ -212,6 +212,34 @@ class LC_CSR_CSC_Graph } public: + ///////////////////////////////////////////////////////////////////////////// + // Manual construction functions + ///////////////////////////////////////////////////////////////////////////// + + // no edge data support at the moment for these functions because not required + // for the current use case + + //! Reallocate memory for the CSC part of the graph + void CSCAllocate() { + // assumes nodes and edges set from CSR version of this call + in_edge_dst_.deallocate(); + in_edge_ind_data_.deallocate(); + + if (UseNumaAlloc) { + in_edge_ind_data_.allocateBlocked(BaseGraph::numNodes); + in_edge_dst_.allocateBlocked(BaseGraph::numEdges); + } else { + in_edge_ind_data_.allocateInterleaved(BaseGraph::numNodes); + in_edge_dst_.allocateInterleaved(BaseGraph::numEdges); + } + } + //! Construct the in edge for some edge index by setting the destination + void ConstructInEdge(EdgeIndexTy e, NodeIndexTy dst) { + in_edge_dst_[e] = dst; + } + //! In-edge index setting + void FixEndInEdge(NodeIndexTy n, EdgeIndexTy e) { in_edge_ind_data_[n] = e; } + ///////////////////////////////////////////////////////////////////////////// // Construction functions ///////////////////////////////////////////////////////////////////////////// @@ -274,7 +302,7 @@ class LC_CSR_CSC_Graph if (!HasNoLockable && galois::runtime::shouldLock(mflag)) { for (edge_iterator ii = in_raw_begin(N), ee = in_raw_end(N); ii != ee; ++ii) { - BaseGraph::acquireNode(inEdgeDst[*ii], mflag); + BaseGraph::acquireNode(in_edge_dst_[*ii], mflag); } } return in_raw_begin(N); @@ -313,7 +341,7 @@ class LC_CSR_CSC_Graph * @param ni edge id * @returns destination for that in edge */ - GraphNode getInEdgeDst(edge_iterator ni) const { return inEdgeDst[*ni]; } + GraphNode getInEdgeDst(edge_iterator ni) const { return in_edge_dst_[*ni]; } /** * Given an edge id for in edge, get the data associated with that edge. 
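The manual CSC construction API above is easiest to see on a toy graph. The following is a minimal sketch (not part of the patch): it assumes the usual LC_CSR_Graph construction calls (allocateFrom, constructNodes, fixEndEdge, constructEdge) and an edge-data-free instantiation of LC_CSR_CSC_Graph; the function name, graph type parameter, and sizes are illustrative only.

  #include "galois/graphs/LC_CSR_CSC_Graph.h"

  // GraphTy is assumed to be some edge-data-free LC_CSR_CSC_Graph instantiation.
  template <typename GraphTy>
  void BuildToyCSC(GraphTy& g) {
    // CSR (out-edge) side, standard construction:
    // 3 nodes, directed edges 0->1, 0->2, 1->2
    g.allocateFrom(3, 3);
    g.constructNodes();
    g.fixEndEdge(0, 2); g.constructEdge(0, 1); g.constructEdge(1, 2);
    g.fixEndEdge(1, 3); g.constructEdge(2, 2);
    g.fixEndEdge(2, 3);

    // CSC (in-edge) side built by hand instead of the automatic transpose
    g.CSCAllocate();          // sized from numNodes/numEdges set above
    g.FixEndInEdge(0, 0);     // node 0: no in-edges
    g.FixEndInEdge(1, 1);     // node 1: in-edges occupy indices [0, 1)
    g.FixEndInEdge(2, 3);     // node 2: in-edges occupy indices [1, 3)
    g.ConstructInEdge(0, 0);  // in-edge of node 1; stored value is the source (0)
    g.ConstructInEdge(1, 0);  // in-edge of node 2, from node 0
    g.ConstructInEdge(2, 1);  // in-edge of node 2, from node 1
    // getInEdgeDst() on node 2's in-edges now yields 0 and 1,
    // matching the out-edges constructed above.
  }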
From 455159e13e980796cc0e59ec712fe220e5ef7e16 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 23 Apr 2021 18:43:47 -0500 Subject: [PATCH 513/660] AVX512 for vector add/mul function --- libgnn/src/GNNMath.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index afb3712981..9e8de18d5f 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -52,6 +52,24 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, const GNNFloat b_scale, GNNFloat* output) { +#ifdef __AVX512F__ + // 512 + constexpr size_t vectorization_length = 16; + const size_t aligned_end = length - length % vectorization_length; + __m512 scale_vec_main = _mm512_set_ps( + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); + for (size_t i = 0; i < aligned_end; i += vectorization_length) { + _mm512_storeu_ps( + &output[i], + _mm512_add_ps(_mm512_loadu_ps(&a[i]), + _mm512_mul_ps(scale_vec_main, _mm512_loadu_ps(&b[i])))); + } + // handle the rest + for (size_t i = aligned_end; i < length; ++i) { + output[i] = a[i] + b[i] * b_scale; + } +#else #ifdef __AVX2__ constexpr size_t vectorization_length = 8; // for 32-bit floating point in AVX2; TODO AVX512 @@ -82,6 +100,7 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, output[i] = a[i] + b[i] * b_scale; } #endif +#endif } void galois::GNNSoftmax(const size_t vector_length, const GNNFloat* input, From 4d7bc090b048f0d0b32077edc5da6e8d19ed349e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 24 Apr 2021 13:05:31 -0500 Subject: [PATCH 514/660] MKL link update: 20.0 and parallel link On CDGC machines, link to MKL 20. Also, fix sequential MKL link because that makes the BLAS calls sequential and not parallel. Fixing parallel link ended up with a near 4x speedup for 3 layer 256 hidden layer SAGE on products. --- CMakeLists.txt | 2 +- libgnn/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb72f24c71..1eaa1e1e0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,7 +141,7 @@ endif() # TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file ################################################################################ if(USE_MKL_BLAS) - SET(INTEL_ROOT /opt/apps/sysnet/intel/19.0) + SET(INTEL_ROOT /opt/apps/sysnet/intel/20.0) SET(MKL_ROOT ${INTEL_ROOT}/mkl) find_package(MKL REQUIRED) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ed60ae032b..665cd14545 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -13,7 +13,7 @@ set(sources ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) -set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") +set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") add_library(galois_gnn STATIC ${sources}) From 0d31eff9e9d1e0edccad8702792b59faed92f950 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 26 Apr 2021 16:58:53 -0500 Subject: [PATCH 515/660] GNNSubgraph in Graph Subgraph object in graph that is used when flag is flipped. NOT CHECKED IN DETAIL DUE TO DEADLINE COMING UP AND NEED TO RUSH. 
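Roughly, the flag flip works as sketched below (not the exact training code; the `graph` variable name and loop bounds are illustrative, and the calls are the ones added to GNNGraph in this patch):

  // after marking sampled nodes/edges for every layer:
  size_t rows = graph.ConstructSampledSubgraph();  // builds the CSR/CSC copy,
                                                   // then sets use_subgraph_
  // from here on, accessors take subgraph ids (SIDs in [0, rows)) and
  // translate back to local ids internally where needed (labels, masks):
  for (size_t sid = 0; sid < rows; ++sid) {
    for (auto e : graph.edges(sid)) {              // sampled topology only
      size_t dst_sid = graph.GetEdgeDest(e);       // also a subgraph id
      (void)dst_sid;
    }
  }
  auto feats = graph.GetLocalFeatures();           // subgraph's feature copy
  graph.DisableSubgraph();                         // full graph for val/test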
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNTypes.h | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 234 +++++++++++++++++---- libgnn/include/galois/graphs/GNNSubgraph.h | 132 ++++++++++++ libgnn/src/GNNMath.cpp | 2 +- libgnn/src/graphs/GNNGraph.cpp | 139 ++++++++++-- libgnn/src/graphs/GNNSubgraph.cpp | 186 ++++++++++++++++ 7 files changed, 636 insertions(+), 60 deletions(-) create mode 100644 libgnn/include/galois/graphs/GNNSubgraph.h create mode 100644 libgnn/src/graphs/GNNSubgraph.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 665cd14545..2393ce043b 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -3,6 +3,7 @@ set(sources src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp src/graphs/GNNGraph.cpp + src/graphs/GNNSubgraph.cpp src/layers/DenseLayer.cpp src/layers/GNNLayer.cpp src/layers/GraphConvolutionalLayer.cpp diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 3603cb68d7..492bc841dc 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -25,7 +25,7 @@ using GPUNodeIndex = uint32_t; using GPUEdgeIndex = uint64_t; //! Phase of GNN computation -enum class GNNPhase { kTrain, kValidate, kTest }; +enum class GNNPhase { kTrain, kValidate, kTest, kOther }; //! Vector like wrapper over a pointer and size; exists solely to pass around //! raw pointers with size (because vectors are a no-go due to the code diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 853a96dc0d..3a538d9da5 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -30,7 +30,6 @@ namespace graphs { //! Possible partitioning schemes for the GNN graph enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; -//! XXX class GNNGraph { public: using GNNDistGraph = galois::graphs::DistGraph; @@ -66,6 +65,14 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + //! Returns # of nodes in the *graph that is currently active*. + size_t active_size() const { + if (!use_subgraph_) { + return partitioned_graph_->size(); + } else { + return subgraph_->size(); + } + } bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } @@ -73,28 +80,44 @@ class GNNGraph { //! Node begin for all local nodes NodeIterator begin() const { - return partitioned_graph_->allNodesRange().begin(); + if (!use_subgraph_) { + return partitioned_graph_->allNodesRange().begin(); + } else { + return subgraph_->begin(); + } } //! 
Node end for all local nodes - NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } + NodeIterator end() const { + if (!use_subgraph_) { + return partitioned_graph_->allNodesRange().end(); + } else { + return subgraph_->end(); + } + } NodeIterator begin_owned() const { - return partitioned_graph_->masterNodesRange().begin(); + if (!use_subgraph_) { + return partitioned_graph_->masterNodesRange().begin(); + } else { + return subgraph_->begin_owned(); + } } NodeIterator end_owned() const { - return partitioned_graph_->masterNodesRange().end(); + if (!use_subgraph_) { + return partitioned_graph_->masterNodesRange().end(); + } else { + return subgraph_->end_owned(); + } } ////////////////////////////////////////////////////////////////////////////// // Edges ////////////////////////////////////////////////////////////////////////////// - void InitializeEdgeData() { InitializeEdgeData(1); } - - void InitializeEdgeData(size_t num_layers) { - edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); - } + void InitializeSamplingData() { InitializeSamplingData(1); } + //! Initialize data required to do graph sampling + void InitializeSamplingData(size_t num_layers); ////////////////////////////////////////////////////////////////////////////// // Out Edges @@ -102,17 +125,64 @@ class GNNGraph { // All following functions take a local node id EdgeIterator edge_begin(GraphNode n) const { - return partitioned_graph_->edge_begin(n); + if (!use_subgraph_) { + return partitioned_graph_->edge_begin(n); + } else { + return subgraph_->edge_begin(n); + } }; + EdgeIterator edge_end(GraphNode n) const { - return partitioned_graph_->edge_end(n); + if (!use_subgraph_) { + return partitioned_graph_->edge_end(n); + } else { + return subgraph_->edge_end(n); + } }; GraphNode GetEdgeDest(EdgeIterator ei) const { - return partitioned_graph_->getEdgeDst(ei); + if (!use_subgraph_) { + return partitioned_graph_->getEdgeDst(ei); + } else { + return subgraph_->GetEdgeDest(ei); + } + }; + galois::runtime::iterable< + galois::NoDerefIterator> + edges(GraphNode N) const { + if (!use_subgraph_) { + return partitioned_graph_->edges(N); + } else { + return subgraph_->edges(N); + } + } + + bool IsEdgeSampledAny(EdgeIterator ei) const { + for (bool b : edge_sample_status_[*ei]) { + if (b) + return true; + } + return false; + } + bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { + if (!use_subgraph_) { + return edge_sample_status_[ei][layer_num]; + } else { + GALOIS_LOG_FATAL("This shouldn't be called with subgraph"); + return false; + } }; bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { + if (!use_subgraph_) { + return edge_sample_status_[*ei][layer_num]; + } else { + return subgraph_->OutEdgeSampled(ei, layer_num, *this); + } + }; + //! Always use original graph's edge iterator here + bool IsEdgeSampledOriginalGraph(EdgeIterator ei, size_t layer_num) const { return edge_sample_status_[*ei][layer_num]; }; + //! 
Set the flag on the edge to 1; makes it sampled void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[*ei][layer_num] = 1; @@ -121,28 +191,62 @@ class GNNGraph { void MakeEdgeUnsampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[*ei][layer_num] = 0; }; - galois::runtime::iterable< - galois::NoDerefIterator> - edges(GraphNode N) { - return partitioned_graph_->edges(N); - } ////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// EdgeIterator in_edge_begin(GraphNode n) const { - return partitioned_graph_->in_edge_begin(n); + if (!use_subgraph_) { + return partitioned_graph_->in_edge_begin(n); + } else { + return subgraph_->in_edge_begin(n); + } } EdgeIterator in_edge_end(GraphNode n) const { - return partitioned_graph_->in_edge_end(n); + if (!use_subgraph_) { + return partitioned_graph_->in_edge_end(n); + } else { + return subgraph_->in_edge_end(n); + } + } + galois::runtime::iterable< + galois::NoDerefIterator> + in_edges(GraphNode N) const { + if (!use_subgraph_) { + return partitioned_graph_->in_edges(N); + } else { + return subgraph_->in_edges(N); + } } GraphNode GetInEdgeDest(EdgeIterator ei) const { - return partitioned_graph_->GetInEdgeDest(ei); + if (!use_subgraph_) { + return partitioned_graph_->GetInEdgeDest(ei); + } else { + return subgraph_->GetInEdgeDest(ei); + } + }; + + EdgeIterator InEdgeToOutEdge(EdgeIterator in_edge_iter) const { + return partitioned_graph_->InEdgeToOutEdge(in_edge_iter); + } + + bool IsInEdgeSampledAny(EdgeIterator ei) const { + for (bool b : + edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)]) { + if (b) + return true; + } + return false; }; bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { - return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] - [layer_num]; + if (!use_subgraph_) { + return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] + [layer_num]; + } else { + return subgraph_->InEdgeSampled(ei, layer_num, *this); + } }; + //! Set the flag on the edge to 1; makes it sampled void MakeInEdgeSampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 1; @@ -151,11 +255,6 @@ class GNNGraph { void MakeInEdgeUnsampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 0; }; - galois::runtime::iterable< - galois::NoDerefIterator> - in_edges(GraphNode N) { - return partitioned_graph_->in_edges(N); - } ////////////////////////////////////////////////////////////////////////////// // neighborhood sampling @@ -163,15 +262,42 @@ class GNNGraph { //! Set seed nodes, i.e., nodes that are being predicted on void SetupNeighborhoodSample(); + + //! Choose all edges from sampled nodes + void SampleAllEdges(size_t agg_layer_num); //! Sample neighbors of nodes that are marked as ready for sampling void SampleEdges(size_t sample_layer_num, size_t num_to_sample); + //! 
Construct the subgraph from sampled edges and corresponding nodes + size_t ConstructSampledSubgraph() { + // false first so that the build process can use functions to access the + // real graph + use_subgraph_ = false; + size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); + // after this, this graph is a subgraph + use_subgraph_ = true; + return num_subgraph_nodes; + } + + void EnableSubgraph() { use_subgraph_ = true; } + + void DisableSubgraph() { use_subgraph_ = false; } + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat GetDegreeNorm(GraphNode n) const { return degree_norm_[n]; } + GNNFloat GetDegreeNorm(GraphNode n) const { + if (!use_subgraph_) { + return degree_norm_[n]; + } else { + // XXX does not work in distributed case, fix there + // XXX also need to account for current layer number in sampling + // case because degrees in each layer differ + return 1.0 / subgraph_->GetLocalDegree(n); + } + } // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, @@ -183,11 +309,19 @@ class GNNGraph { //! class labels. GNNFloat GetSingleClassLabel(const unsigned lid) const { assert(using_single_class_labels_); - if (local_ground_truth_labels_[lid] != num_label_classes_) { - return local_ground_truth_labels_[lid]; + unsigned to_use = lid; + if (use_subgraph_) { + to_use = subgraph_->SIDToLID(lid); + } + + if (local_ground_truth_labels_[to_use] != num_label_classes_) { + // galois::gPrint(lid, " ", to_use, " ", + // (int)local_ground_truth_labels_[to_use], "\n"); + return local_ground_truth_labels_[to_use]; } else { GALOIS_LOG_FATAL( - "should not get the label of a node that has no ground truth"); + "should not get the label of a node that has no ground truth {}", + to_use); } } @@ -208,7 +342,12 @@ class GNNGraph { local_node_features_.size()); } #endif - return PointerWithSize(local_node_features_); + if (!use_subgraph_) { + return PointerWithSize(local_node_features_); + } else { + return PointerWithSize(subgraph_->GetLocalFeatures().data(), + subgraph_->GetLocalFeatures().size()); + } } //! Given an LID and the current phase of GNN computation, determine if the @@ -216,10 +355,16 @@ class GNNGraph { //! 
a training, validation, or test phase mask) bool IsValidForPhase(const unsigned lid, const galois::GNNPhase current_phase) const { - if (!incomplete_masks_) { - return IsValidForPhaseCompleteRange(lid, current_phase); + // XXX maybe just map this all over to subgraph, though in that case + // issue is that subgraph doesn't necessarily know about test/val + unsigned to_use = lid; + if (use_subgraph_) { + to_use = subgraph_->SIDToLID(lid); + } + if (!incomplete_masks_ && current_phase != GNNPhase::kOther) { + return IsValidForPhaseCompleteRange(to_use, current_phase); } else { - return IsValidForPhaseMasked(lid, current_phase); + return IsValidForPhaseMasked(to_use, current_phase); } } @@ -293,6 +438,10 @@ class GNNGraph { #endif private: +// included like this to avoid cyclic dependency issues + not used anywhere but +// in this class anyways +#include "galois/graphs/GNNSubgraph.h" + ////////////////////////////////////////////////////////////////////////////// // Initialization ////////////////////////////////////////////////////////////////////////////// @@ -307,6 +456,8 @@ class GNNGraph { size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, GNNRange* mask_range, char* masks); + //! Finds nodes that aren't part of the 3 main GNN phase classifications + size_t FindOtherMask(); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); //! Reads the entire graph topology in (but nothing else) @@ -368,10 +519,15 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// + std::unique_ptr subgraph_; + // Degrees for sampled subgraph + galois::LargeArray sampled_out_degrees_; + galois::LargeArray sampled_in_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; - + //! Indicates newly sampled nodes (for distributed synchronization of sampling + //! status galois::DynamicBitSet new_sampled_nodes_; ////////////////////////////////////////////////////////////////////////////// @@ -383,6 +539,9 @@ class GNNGraph { std::vector local_validation_mask_; //! Bitset indicating which nodes are testing nodes std::vector local_testing_mask_; + size_t valid_other_{0}; + //! Bitset indicating which nodes don't fall anywhere + std::vector other_mask_; //! Global mask range for training nodes; must convert to LIDs when using //! in this class @@ -408,6 +567,7 @@ class GNNGraph { galois::PerThreadRNG sample_rng_; // TODO vars for subgraphs as necessary + bool use_subgraph_{false}; ////////////////////////////////////////////////////////////////////////////// // GPU things diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h new file mode 100644 index 0000000000..c3c931f0da --- /dev/null +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -0,0 +1,132 @@ +// Note no header guard or anything like that; this file is meant to be +// included in the middle of GNNGraph class declaration as a class in a class +class GNNSubgraph { +public: + using GraphNode = LC_CSR_CSC_Graph::GraphNode; + using NodeIterator = boost::counting_iterator; + using EdgeIterator = LC_CSR_CSC_Graph::edge_iterator; + + //! 
Allocates space for the lid to sid map + GNNSubgraph(size_t main_graph_size) { + lid_to_subgraph_id_.create(main_graph_size, + std::numeric_limits::max()); + // the subgraph to original graph maps are allocated on demand in gstl + // vectors since those change every epoch + } + //! Given sampled bits set on gnn_graph, builds an explicit subgraph + //! for the sampled bits + size_t BuildSubgraph(GNNGraph& gnn_graph); + + galois::gstl::Vector& GetLocalFeatures() { + return subgraph_node_features_; + } + + ////////////////////////////////////////////////////////////////////////////// + // Nodes + ////////////////////////////////////////////////////////////////////////////// + + uint32_t size() { return num_subgraph_nodes_; } + NodeIterator begin() const { return NodeIterator(0); } + NodeIterator end() const { return NodeIterator(num_subgraph_nodes_); } + + NodeIterator begin_owned() const { return NodeIterator(0); } + NodeIterator end_owned() const { + return NodeIterator(subgraph_master_boundary_); + } + + uint32_t SIDToLID(uint32_t sid) const { return subgraph_id_to_lid_[sid]; } + + ////////////////////////////////////////////////////////////////////////////// + // Edge iteration and destination + ////////////////////////////////////////////////////////////////////////////// + + EdgeIterator edge_begin(GraphNode n) { + return underlying_graph_.edge_begin(n); + } + EdgeIterator edge_end(GraphNode n) { return underlying_graph_.edge_end(n); } + GraphNode GetEdgeDest(EdgeIterator out_edge_iterator) { + return underlying_graph_.getEdgeDst(out_edge_iterator); + }; + galois::runtime::iterable< + galois::NoDerefIterator> + edges(GraphNode n) { + return internal::make_no_deref_range(edge_begin(n), edge_end(n)); + } + + EdgeIterator in_edge_begin(GraphNode n) { + return underlying_graph_.in_edge_begin(n); + } + EdgeIterator in_edge_end(GraphNode n) { + return underlying_graph_.in_edge_end(n); + } + GraphNode GetInEdgeDest(EdgeIterator in_edge_iterator) { + return underlying_graph_.getInEdgeDst(in_edge_iterator); + }; + galois::runtime::iterable< + galois::NoDerefIterator> + in_edges(GraphNode n) { + return internal::make_no_deref_range(in_edge_begin(n), in_edge_end(n)); + } + + size_t GetLocalDegree(GraphNode n) { + return std::distance(edge_begin(n), edge_end(n)); + } + + ////////////////////////////////////////////////////////////////////////////// + // Edge sampling status check + ////////////////////////////////////////////////////////////////////////////// + + bool OutEdgeSampled(EdgeIterator out_edge_iterator, size_t layer_num, + const GNNGraph& original_graph) { + return original_graph.IsEdgeSampledOriginalGraph( + subedge_to_original_edge_[*out_edge_iterator], layer_num); + } + bool InEdgeSampled(EdgeIterator in_edge_iterator, size_t layer_num, + const GNNGraph& original_graph) { + // note that original IsEdgeSampled is called because this object stores the + // original edge already + return original_graph.IsEdgeSampledOriginalGraph( + in_subedge_to_original_edge_[*in_edge_iterator], layer_num); + } + + ////////////////////////////////////////////////////////////////////////////// + +private: + //! Creates subgraph ID mapping from the number of sampled nodes from the + //! original graph. Should be done every epoch when sampled graph changes. + void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph); + //! Counts in and out degrees of all sampled nodes in the graph + void DegreeCounting(const GNNGraph& gnn_graph); + //! Creates edges + void EdgeCreation(const GNNGraph& gnn_graph); + //! 
Copies over relevant features of the nodes + void NodeFeatureCreation(GNNGraph& gnn_graph); + + static const constexpr char* kRegionName = "GNNSubgraph"; + + // name is self explanatory + LC_CSR_CSC_Graph underlying_graph_; + // size vars + uint32_t num_subgraph_nodes_; + uint32_t num_subgraph_edges_; + uint32_t subgraph_master_boundary_; + //! Features corresponding only to this subgraph; copied from main graph + //! (in other words, redundant; would be nice if there was a way to + //! fake contiguous memory + galois::gstl::Vector subgraph_node_features_; + //! Dense array mapping local ids to subgraph id (not space efficient) + galois::LargeArray lid_to_subgraph_id_; + //! Map subgraph ids back to local graph ids + //! gstl vector because this will get resized every epoch (LargeArray + //! is for static) + galois::gstl::Vector subgraph_id_to_lid_; + // intermediate degrees used for edge construction + galois::gstl::Vector subgraph_out_degrees_; + galois::gstl::Vector subgraph_in_degrees_; + //! Maps from subgraph out-edge id to original graph edge id (used to check if + //! edge exists in particular layer) + galois::gstl::Vector subedge_to_original_edge_; + //! Maps from subgraph in-edge id to original graph edge id (used to check if + //! edge exists in particular layer) + galois::gstl::Vector in_subedge_to_original_edge_; +}; diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 9e8de18d5f..582fba95f6 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -56,7 +56,7 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, // 512 constexpr size_t vectorization_length = 16; const size_t aligned_end = length - length % vectorization_length; - __m512 scale_vec_main = _mm512_set_ps( + __m512 scale_vec_main = _mm512_set_ps( b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); for (size_t i = 0; i < aligned_end; i += vectorization_length) { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c12288c950..56572ccb76 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -126,6 +126,9 @@ bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( case GNNPhase::kTest: range_to_use = &global_testing_mask_range_; break; + case GNNPhase::kOther: + GALOIS_LOG_FATAL("no range for other"); + break; default: GALOIS_LOG_FATAL("Invalid phase used"); range_to_use = nullptr; @@ -156,6 +159,12 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( case GNNPhase::kTest: mask_to_use = &local_testing_mask_; break; + case GNNPhase::kOther: + if (valid_other_ == 0) { + return false; + } + mask_to_use = &other_mask_; + break; default: GALOIS_LOG_FATAL("Invalid phase used"); mask_to_use = nullptr; @@ -486,6 +495,25 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( return valid_count; } +size_t galois::graphs::GNNGraph::FindOtherMask() { + galois::GAccumulator other_accum; + other_accum.reset(); + other_mask_.resize(partitioned_graph_->size()); + + galois::do_all( + galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t local_id) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain) && + !IsValidForPhase(local_id, GNNPhase::kValidate) && + !IsValidForPhase(local_id, GNNPhase::kTest)) { + other_mask_[local_id] = 1; + other_accum += 1; + } + }, + galois::loopname("FindOtherMask")); + return other_accum.reduce(); +} + void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& 
dataset_name) { // allocate the memory for the local masks local_training_mask_.resize(partitioned_graph_->size()); @@ -535,10 +563,13 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, local_testing_mask_.data()); + valid_other_ = FindOtherMask(); + // the "other" set of nodes that don't fall into any classification if (galois::runtime::getSystemNetworkInterface().ID == 0) { galois::gInfo("Valid # training nodes is ", valid_train); galois::gInfo("Valid # validation nodes is ", valid_val); galois::gInfo("Valid # test nodes is ", valid_test); + galois::gInfo("Valid # other nodes is ", valid_other_); } } } @@ -665,26 +696,30 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( - PointerWithSize predictions, GNNPhase phase, bool sampling) { + PointerWithSize predictions, GNNPhase phase, bool) { // check owned nodes' accuracy assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); total_checked_.reset(); galois::do_all( + // will only loop over sampled nodes if sampling is on galois::iterate(begin_owned(), end_owned()), - [&](const unsigned lid) { - if (IsValidForPhase(lid, phase)) { - if (sampling) { - if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { - return; - } - } + // this is possibly the subgraph id + [&](const unsigned node_id) { + unsigned lid = node_id; + if (use_subgraph_) { + // convert SID over to LID + lid = subgraph_->SIDToLID(node_id); + } + if (IsValidForPhase(lid, phase)) { total_checked_ += 1; // get prediction by getting max + // note the use of node_id here: lid only used to check original + // labels size_t predicted_label = galois::MaxIndex( - num_label_classes_, &(predictions[lid * num_label_classes_])); + num_label_classes_, &(predictions[node_id * num_label_classes_])); // check against ground truth and track accordingly // TODO static cast used here is dangerous if (predicted_label == @@ -699,7 +734,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); + GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, + global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -819,6 +855,15 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( return global_f1_micro_score; } +//////////////////////////////////////////////////////////////////////////////// + +void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers) { + subgraph_ = std::make_unique(partitioned_graph_->size()); + edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); + sampled_out_degrees_.create(partitioned_graph_->size(), 0); + sampled_in_degrees_.create(partitioned_graph_->size(), 0); +} + void galois::graphs::GNNGraph::SetupNeighborhoodSample() { new_sampled_nodes_.resize(size()); new_sampled_nodes_.reset(); @@ -839,8 +884,54 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { }); } +void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { + use_subgraph_ = false; + + galois::GAccumulator sampled; + galois::GAccumulator total; + sampled.reset(); + total.reset(); + + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + // only operate on if sampled + if 
(partitioned_graph_->getData(*x)) { + // marks ALL edges of nodes that connect to train/other nodes + for (auto edge_iter : partitioned_graph_->edges(*x)) { + if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) || + IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + MakeEdgeSampled(edge_iter, agg_layer_num); + new_sampled_nodes_.set(partitioned_graph_->getEdgeDst(edge_iter)); + sampled += 1; + } + total += 1; + } + } + }, + galois::steal(), galois::loopname("ChooseAllEdges")); + + galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", + total.reduce(), "\n"); + + std::vector new_nodes = new_sampled_nodes_.getOffsets(); + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(new_nodes), + [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::loopname("NeighborhoodSampleSet")); + + // XXX(loc) bitset; can readAny be weaker? + sync_substrate_->sync("SampleSync"); +} + void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample) { + use_subgraph_ = false; + galois::GAccumulator sampled; galois::GAccumulator total; sampled.reset(); @@ -852,18 +943,24 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, if (partitioned_graph_->getData(*x)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) - // XXX in-degree prob, not out degree double probability_of_reject = std::pow(1 - GetDegreeNorm(*x), num_to_sample); - // loop through in-edges, turn "on" edge with some probability - for (auto edge_iter : partitioned_graph_->in_edges(*x)) { + // loop through edges, turn "on" edge with some probability + for (auto edge_iter : partitioned_graph_->edges(*x)) { if (sample_rng_.DoBernoulli(probability_of_reject)) { - // if here, it means edge accepted; set sampled on, mark source - // as part of next set - MakeInEdgeSampled(edge_iter, sample_layer_num); - new_sampled_nodes_.set( - partitioned_graph_->GetInEdgeDest(edge_iter)); - sampled += 1; + // only take if node is training node or a node not classified + // into train/test/val + if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) || + IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + // if here, it means edge accepted; set sampled on, mark source + // as part of next set + MakeEdgeSampled(edge_iter, sample_layer_num); + new_sampled_nodes_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + sampled += 1; + } } total += 1; } @@ -871,8 +968,8 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, }, galois::steal(), galois::loopname("NeighborhoodSample")); - galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", - total.reduce(), "\n"); + galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", + sampled.reduce(), " out of ", total.reduce()); std::vector new_nodes = new_sampled_nodes_.getOffsets(); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp new file mode 100644 index 0000000000..e80dfffbc9 --- /dev/null +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -0,0 +1,186 @@ +#include "galois/graphs/GNNGraph.h" +#include + +size_t +galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { + galois::StatTimer timer("BuildSubgraph", kRegionName); + timer.start(); + CreateLocalToSubgraphMapping(gnn_graph); + 
DegreeCounting(gnn_graph); + EdgeCreation(gnn_graph); + NodeFeatureCreation(gnn_graph); + // loop over each node, grab out/in edges, construct them in LC_CSR_CSC + // no edge data, just topology + timer.stop(); + return num_subgraph_nodes_; +} + +void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( + const GNNGraph& gnn_graph) { + galois::StatTimer timer("LIDToSIDMapping", kRegionName); + timer.start(); + + assert(gnn_graph.size() == lid_to_subgraph_id_.size()); + // clear all mappings + std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), + std::numeric_limits::max()); + // TODO(loc) depending on overhead, can parallelize this with a prefix sum + // serial loop over LIDs to construct lid -> subgraph id mapping + uint32_t current_sid = 0; + + // split into 2 parts: masters, then mirrors + size_t last_owned_node = *(gnn_graph.end_owned()); + for (size_t local_node_id = 0; local_node_id < last_owned_node; + local_node_id++) { + if (gnn_graph.IsInSampledGraph(local_node_id)) { + // TODO should bound check the SID to max uint32_t + // note: if SID is max uint32t, then it's not valid + lid_to_subgraph_id_[local_node_id] = current_sid++; + } + } + + // all nodes before this SID are master nodes + subgraph_master_boundary_ = current_sid; + + for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); + local_node_id++) { + if (gnn_graph.IsInSampledGraph(local_node_id)) { + // TODO should bound check the SID to max uint32_t + // note: if SID is max uint32t, then it's not valid + lid_to_subgraph_id_[local_node_id] = current_sid++; + } + } + galois::gDebug("Numbered sampled nodes for subgraph construction is ", + current_sid); + + num_subgraph_nodes_ = current_sid; + + timer.stop(); +} + +void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( + const GNNGraph& gnn_graph) { + galois::StatTimer timer("DegreeCounting", kRegionName); + timer.start(); + + subgraph_id_to_lid_.resize(num_subgraph_nodes_); + subgraph_out_degrees_.resize(num_subgraph_nodes_); + subgraph_in_degrees_.resize(num_subgraph_nodes_); + + galois::do_all( + galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsInSampledGraph(node_id)) { + uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; + subgraph_id_to_lid_[subgraph_id] = node_id; + + uint32_t out_degrees = 0; + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + out_degrees++; + } + } + subgraph_out_degrees_[subgraph_id] = out_degrees; + + uint32_t in_degrees = 0; + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_degrees++; + } + } + subgraph_in_degrees_[subgraph_id] = in_degrees; + // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", + // out_degrees, " in ", in_degrees); + } + }, + galois::steal()); + + timer.stop(); +} + +void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( + const GNNGraph& gnn_graph) { + galois::StatTimer timer("EdgeConstruction", kRegionName); + timer.start(); + + // prefix sum over subgraph degrees from previous phase to get starting points + for (size_t i = 1; i < num_subgraph_nodes_; i++) { + subgraph_out_degrees_[i] += subgraph_out_degrees_[i - 1]; + subgraph_in_degrees_[i] += subgraph_in_degrees_[i - 1]; + } + + // allocate then set node endpoints + num_subgraph_edges_ = subgraph_out_degrees_.back(); + underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); + underlying_graph_.CSCAllocate(); + 
galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), + [&](uint32_t subgraph_id) { + underlying_graph_.fixEndEdge( + subgraph_id, subgraph_out_degrees_[subgraph_id]); + underlying_graph_.FixEndInEdge( + subgraph_id, subgraph_in_degrees_[subgraph_id]); + }); + subedge_to_original_edge_.resize(num_subgraph_edges_); + in_subedge_to_original_edge_.resize(num_subgraph_edges_); + + // save edges + save reference to layer sample status + galois::do_all( + galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsInSampledGraph(node_id)) { + uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; + + uint32_t out_location = 0; + uint32_t in_location = 0; + if (subgraph_id != 0) { + out_location = subgraph_out_degrees_[subgraph_id - 1]; + in_location = subgraph_in_degrees_[subgraph_id - 1]; + } + + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + subedge_to_original_edge_[out_location] = *out_edge_iter; + underlying_graph_.constructEdge( + out_location++, gnn_graph.GetEdgeDest(out_edge_iter)); + } + } + + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_subedge_to_original_edge_[in_location] = + *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); + underlying_graph_.ConstructInEdge( + in_location++, gnn_graph.GetInEdgeDest(in_edge_iter)); + } + } + assert(out_location == subgraph_out_degrees_[subgraph_id]); + assert(in_location == subgraph_in_degrees_[subgraph_id]); + } + }, + galois::steal()); + timer.stop(); +} + +void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( + GNNGraph& gnn_graph) { + galois::StatTimer timer("NodeFeatureCreation", kRegionName); + timer.start(); + size_t feat_length = gnn_graph.node_feature_length(); + // assumes everything is already setup + subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); + + galois::do_all(galois::iterate(begin(), end()), [&](size_t subgraph_node_id) { + size_t local_id = subgraph_id_to_lid_[subgraph_node_id]; + std::memcpy( + &(subgraph_node_features_[subgraph_node_id * feat_length]), + &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), + feat_length * sizeof(GNNFeature)); + // for (unsigned i = 0; i < feat_length; i++) { + // galois::gPrint(feat_length * sizeof(GNNFeature) , " ", subgraph_node_id, + // " local id " , local_id, " feat at ", i, " is ", + // subgraph_node_features_[subgraph_node_id * feat_length + i], " ", + // gnn_graph.GetLocalFeatures()[local_id * feat_length + i], "\n"); + //} + }); + timer.stop(); +} From 5b48172ddd90311d65c006ada8fcadfe1712380c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 26 Apr 2021 17:00:16 -0500 Subject: [PATCH 516/660] WIP inductive training on GNN Works on single host seemingly, but code not checked in detail yet. 
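In rough outline, the two modes this patch wires into Train() behave as follows (a sketch only; `layers` stands in for gnn_layers_, and the fan-out of 30 and the is_aggregating() helper are illustrative):

  // Inductive: one subgraph over every edge from an already-sampled node to a
  // train/"other" node, built once before the epoch loop.
  graph.SetupNeighborhoodSample();
  for (auto it = layers.rbegin(); it != layers.rend(); ++it)
    if (is_aggregating(**it))                      // GCN or SAGE layers only
      graph.SampleAllEdges((*it)->layer_number());
  size_t rows = graph.ConstructSampledSubgraph();

  // Sampling: the neighborhood is re-drawn every epoch with a fixed fan-out.
  for (size_t epoch = 0; epoch < num_epochs; ++epoch) {
    graph.SetupNeighborhoodSample();
    for (auto it = layers.rbegin(); it != layers.rend(); ++it)
      if (is_aggregating(**it))
        graph.SampleEdges((*it)->layer_number(), 30);
    rows = graph.ConstructSampledSubgraph();
    for (auto& layer : layers)
      layer->ResizeRows(rows);                     // shrink logical row count only
    // ... DoInference()/GradientPropagation() over 'rows' rows ...
    graph.DisableSubgraph();                       // validation/test epochs go
    for (auto& layer : layers)                     // back to the full graph
      layer->ResizeRows(graph.size());
  }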
--- libgnn/include/galois/layers/GNNLayer.h | 6 ++ libgnn/include/galois/layers/SAGELayer.h | 9 ++- libgnn/src/GraphNeuralNetwork.cpp | 82 +++++++++++++++------- libgnn/src/layers/GNNLayer.cpp | 19 +++--- libgnn/src/layers/SAGELayer.cpp | 86 +++++++++++------------- libgnn/src/layers/SoftmaxLayer.cpp | 16 +++-- libgnn/test/sample-bit-test.cpp | 2 +- 7 files changed, 131 insertions(+), 89 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index b5fb109ffe..82b149ee5e 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -94,6 +94,12 @@ class GNNLayer { : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig()) {} + virtual void ResizeRows(size_t new_row_count) { + layer_dimensions_.input_rows = new_row_count; + // TODO(loc) output matrix should be resized if space becomes an issue, + // else just use first S rows (S = subgraph size) + } + GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index b5ee978067..dd9ceb6e7b 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -28,7 +28,6 @@ class SAGELayer : public GNNLayer { //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, const SAGELayerConfig& sage_config); @@ -45,6 +44,14 @@ class SAGELayer : public GNNLayer { : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig(), SAGELayerConfig()) {} + void ResizeRows(size_t new_row_count) { + galois::gDebug("Resizing SAGE layer for sampled graph from ", + layer_dimensions_.input_rows); + GNNLayer::ResizeRows(new_row_count); + galois::gDebug("To ", layer_dimensions_.input_rows); + // TODO(loc) resize input matrices if space is reason for doing this + } + void InitSelfWeightsTo1() { if (layer_weights_2_.size()) { layer_weights_2_.assign(layer_weights_2_.size(), 1); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 42bcfc3b08..1b492c34ec 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -13,6 +13,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( galois::GraphNeuralNetworkConfig&& config) : graph_(std::move(graph)), optimizer_(std::move(optimizer)), config_(std::move(config)) { + if (config_.do_sampling_ && config_.inductive_training_) { + GALOIS_LOG_FATAL("Do not set inductive training and sampling at same time " + "(sampling is inductive already)"); + } // max number of rows that can be passed as inputs; allocate space for it as // this will be the # of rows for each layer size_t max_rows = graph_->size(); @@ -99,8 +103,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; } } - // output layer not included; it will never involve sampling - graph_->InitializeEdgeData(gnn_layers_.size()); + if (config_.do_sampling() || config_.inductive_training_) { + // output layer not included; it will never involve sampling + graph_->InitializeSamplingData(gnn_layers_.size()); + } // create the output layer GNNLayerDimensions output_dims = { @@ -134,7 +140,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( 
assert(false); } - // flip sampling + // flip sampling on layers if (config_.do_sampling()) { for (std::unique_ptr& ptr : gnn_layers_) { ptr->EnableSampling(); @@ -145,20 +151,24 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; - - /* - if (config_.do_sampling()) { - for (std::unique_ptr& ptr : gnn_layers_) { - assert(ptr->IsSampledLayer()); - } - } - */ - - bool altered_norm_factor = - config_.inductive_training_ || config_.do_sampling(); - + size_t inductive_nodes = 0; if (config_.inductive_training_) { - graph_->CalculateSpecialNormFactor(false, true); + // Setup the subgraph to only be the training graph + graph_->SetupNeighborhoodSample(); + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + graph_->SampleAllEdges((*back_iter)->layer_number()); + } + } + // resize layer matrices + inductive_nodes = graph_->ConstructSampledSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(inductive_nodes); + } } galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); @@ -168,6 +178,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); + if (config_.inductive_training_) { + graph_->EnableSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(inductive_nodes); + } + } if (config_.do_sampling()) { graph_->SetupNeighborhoodSample(); @@ -180,11 +197,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->layer_number(), 5); + graph_->SampleEdges((*back_iter)->layer_number(), 30); num_sampled_layers++; } } galois::gDebug("Number of sampled layers is ", num_sampled_layers); + + // resize layer matrices + size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(num_subgraph_nodes); + } } const PointerWithSize predictions = DoInference(); @@ -210,8 +234,14 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; // get real norm factor back if altered by sampling or inductive training - if ((do_validate || do_test) && altered_norm_factor) { - graph_->CalculateFullNormFactor(); + if (do_validate || do_test) { + // disable subgraph + graph_->DisableSubgraph(); + // TODO only do this when necessary + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(graph_->size()); + } } if (do_validate) { @@ -256,18 +286,17 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { SetLayerPhases(galois::GNNPhase::kTrain); // get back inductive norm factor as necessary; sampling norm is handled // at beginning of every iteration - if (config_.inductive_training_ && !config_.do_sampling()) { - graph_->CalculateSpecialNormFactor(false, true); - } } } uint64_t average_epoch_time = epoch_timer.get() / num_epochs; galois::runtime::reportStat_Tavg("GraphNeuralNetwork", "AverageEpochTime", average_epoch_time); - - if (altered_norm_factor) { - graph_->CalculateFullNormFactor(); + // disable subgraph + graph_->DisableSubgraph(); + // TODO only do this when necessary + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { + (*layer)->ResizeRows(graph_->size()); } // check test accuracy @@ -284,7 +313,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "FinalTestAccuracy", global_accuracy); } - return global_accuracy; + // return global_accuracy; + return 0; } const galois::PointerWithSize diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index b88f91b631..14d8bd8759 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -154,9 +154,8 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { - size_t num_elements = output_matrix->size(); - assert(num_elements == dropout_mask_.size()); - assert(num_elements == input_to_dropout.size()); + size_t num_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; // determine which parts to drop galois::do_all( @@ -263,7 +262,9 @@ void galois::GNNLayer::Activation() { // TODO only does relu at the moment; should check user specified activation // and act accordingly galois::do_all( - galois::iterate(static_cast(0), forward_output_matrix_.size()), + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), [&](size_t i) { if (forward_output_matrix_[i] > 0.0) { // do nothing, keep value; set the memo though @@ -285,7 +286,9 @@ void galois::GNNLayer::ActivationDerivative( // and act accordingly // keep gradient if the original output was greater than 0 galois::do_all( - galois::iterate(static_cast(0), gradient->size()), + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), [&](size_t i) { // it was <= 0 before; set back to 0 if (!activation_memo_.test(i)) { @@ -326,9 +329,9 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { #else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.size(); + size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.input_columns; - assert((row_index * layer_dimensions_.input_rows) == input->size()); + assert((row_index * layer_dimensions_.input_rows) <= input->size()); galois::do_all( galois::iterate(start_node, end_node), 
[&](size_t non_master) { @@ -349,7 +352,7 @@ void galois::GNNLayer::MaskGradientNonMasters( #else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.size(); + size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.output_columns; galois::do_all( galois::iterate(start_node, end_node), diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8a01c8fe1d..dfae86cbd2 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -125,10 +125,11 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( galois::StatTimer timer("ForwardPhase", kRegionName); timer.start(); - assert(input_embeddings.size() == + assert(input_embeddings.size() >= (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_forward_output_matrix_.size() == + assert(p_forward_output_matrix_.size() >= (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); GNNFloat* agg_data; @@ -172,7 +173,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( Activation(); } - assert(p_forward_output_matrix_.size() == + assert(p_forward_output_matrix_.size() >= (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); timer.stop(); @@ -272,7 +273,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( if (layer_number_ != 0) { // ---unmasked--- // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == + assert(input_gradient->size() >= layer_dimensions_.input_rows * layer_dimensions_.output_columns); // pintemp1 contains (AF)' // overwrites the dropout matrix that was in ptemp1 (needed for second @@ -365,9 +366,14 @@ void galois::SAGELayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AggregateAllGPU( - graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization); + if (!IsSampledLayer()) { + gpu_object_.AggregateAllGPU( + graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, + aggregate_output, !config_.disable_normalization); + } else { + // TODO(hochan) + GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); + } graph_.AggregateSync(aggregate_output, column_length, layer_number_); } else { #endif @@ -385,10 +391,8 @@ void galois::SAGELayer::AggregateAllCPU( galois::substrate::PerThreadStorage>*, bool is_backward) { - size_t num_nodes = graph_.size(); - galois::do_all( - galois::iterate(static_cast(0), num_nodes), + galois::iterate(graph_.begin(), graph_.end()), [&](size_t src) { // TODO(loc) this is currently a hack: the sync substrate blows // up if not the entire bitset is set for sync call like in @@ -400,20 +404,14 @@ void galois::SAGELayer::AggregateAllCPU( aggregate_output[index_to_src_feature + i] = 0; } - if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) - return; - } - - if (IsSampledLayer()) { - // check if node is part of sampled graph - if (!graph_.IsInSampledGraph(src)) { - return; - } - } - } + // if (layer_phase_ == GNNPhase::kTrain) { + // // XXX + // if (IsInductiveLayer()) { + // // if inductive, all non-training nodes do not exist + // if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) + // return; + // } + //} GNNFloat source_norm = 0.0; if 
(!config_.disable_normalization) { @@ -426,22 +424,19 @@ void galois::SAGELayer::AggregateAllCPU( e++) { // graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.GetEdgeDest(e); + // galois::gPrint("(", src, " ", dst, ")\n"); if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; - } - + //// XXX + // if (IsInductiveLayer()) { + // // if inductive, all non-training nodes do not exist + // if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + // return; + //} if (IsSampledLayer()) { if (!graph_.IsEdgeSampled(e, layer_number_)) { continue; } - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; } } @@ -475,20 +470,16 @@ void galois::SAGELayer::AggregateAllCPU( size_t dst = graph_.GetInEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; - } - + // XXX + // if (IsInductiveLayer()) { + // // if inductive, all non-training nodes do not exist + // if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + // return; + //} if (IsSampledLayer()) { if (!graph_.IsInEdgeSampled(e, layer_number_)) { continue; } - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; } } @@ -530,6 +521,9 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif + galois::gPrint(layer_dimensions_.input_rows, " ", + layer_dimensions_.input_columns, " ", + layer_dimensions_.output_columns, "\n"); // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, @@ -564,7 +558,7 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, galois::StatTimer timer("BackwardXForm", kRegionName); timer.start(); - assert(p_layer_weights_.size() == + assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -591,7 +585,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( galois::StatTimer timer("SelfBackwardXForm", kRegionName); timer.start(); - assert(p_layer_weights_.size() == + assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU // TODO gpu self diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 47d5f2ce0b..94523ce327 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,13 +8,13 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; -#ifndef NDEBUG + //#ifndef NDEBUG //#ifdef NDEBUG galois::DGAccumulator loss_accum; galois::DGAccumulator handled; loss_accum.reset(); handled.reset(); -#endif + //#endif galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -44,11 +44,11 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), &p_backward_output_matrix_[feature_length * i]); -#ifndef NDEBUG + //#ifndef NDEBUG //#ifdef NDEBUG loss_accum += 
input_loss_[i]; handled += 1; -#endif + //#endif } else { VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); @@ -57,12 +57,14 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // TODO chunk size? // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); -#ifndef NDEBUG + //#ifndef NDEBUG //#ifdef NDEBUG + GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); - galois::gPrint("Loss is ", reduced_loss / t, "\n"); -#endif + galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); + + //#endif return p_backward_output_matrix_; } diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index 66d739a6d7..89ed60d0ad 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -15,7 +15,7 @@ int main() { galois::graphs::GNNGraph graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - graph.InitializeEdgeData(3); + graph.InitializeSamplingData(3); // first, assert all edges are not sampled (should come with all 0s) for (size_t node = 0; node < graph.size(); node++) { From 8674b9059fb43cb9ca828660a786f8da1bf1f3d8 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 26 Apr 2021 22:10:31 -0500 Subject: [PATCH 517/660] Implement split/combine communications based on Katana's implementation (#3) --- .../include/galois/graphs/DistributedGraph.h | 18 +- .../include/galois/graphs/MiningPartitioner.h | 31 +- libcusp/include/galois/graphs/NewGeneric.h | 66 ++--- libdist/include/galois/runtime/Network.h | 13 +- libdist/include/galois/runtime/Serialize.h | 192 +++++------- libdist/src/DistStats.cpp | 32 +- libdist/src/Network.cpp | 22 +- libdist/src/NetworkBuffered.cpp | 279 ++++++++---------- libdist/src/NetworkLCI.cpp | 6 +- .../galois/graphs/GluonEdgeSubstrate.h | 48 ++- .../include/galois/graphs/GluonSubstrate.h | 50 ++-- 11 files changed, 326 insertions(+), 431 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index e13f71e4d2..bf88a17acf 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -260,14 +260,14 @@ class DistGraph { for (unsigned d = 0; d < DecomposeFactor; ++d) { galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); } - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -330,14 +330,14 @@ class DistGraph { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -447,14 +447,14 @@ class DistGraph { continue; 
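Every point-to-point exchange in this patch moves to the new network API: sendTagged() now consumes its buffer (it takes a SendBuffer&&), and recieveTagged() takes only a tag, with no lock-guard argument. A minimal self-contained sketch of the resulting exchange pattern, assuming a NetworkInterface `net`, host count `numHosts`, local id `id`, and placeholder containers `payload`/`received` (these names are invented for the sketch, not taken from the patch):

    #include "galois/runtime/Network.h"
    #include "galois/runtime/Serialize.h"
    #include <cstdint>
    #include <vector>

    void ExchangeWithAllHosts(galois::runtime::NetworkInterface& net,
                              unsigned numHosts, unsigned id,
                              const std::vector<uint64_t>& payload,
                              std::vector<std::vector<uint64_t>>& received) {
      // `received` must have one slot per host; received[id] stays empty
      for (unsigned h = 0; h < numHosts; ++h) {
        if (h == id)
          continue;
        galois::runtime::SendBuffer b;
        galois::runtime::gSerialize(b, payload);
        // the buffer is moved into the network layer and must not be reused
        net.sendTagged(h, galois::runtime::evilPhase, std::move(b));
      }
      net.flush();
      // expect exactly one message from every other host
      for (unsigned got = 1; got < numHosts; ++got) {
        decltype(net.recieveTagged(galois::runtime::evilPhase)) p;
        do {
          p = net.recieveTagged(galois::runtime::evilPhase); // poll until a message arrives
        } while (!p);
        galois::runtime::gDeserialize(p->second, received[p->first]);
      }
      ++galois::runtime::evilPhase; // advance the phase so later rounds do not mix with this one
    }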
galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; diff --git a/libcusp/include/galois/graphs/MiningPartitioner.h b/libcusp/include/galois/graphs/MiningPartitioner.h index e49d16023e..c809c24dd0 100644 --- a/libcusp/include/galois/graphs/MiningPartitioner.h +++ b/libcusp/include/galois/graphs/MiningPartitioner.h @@ -540,15 +540,15 @@ class MiningGraph : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::SendBuffer bitsetBuffer; galois::runtime::gSerialize(bitsetBuffer, presentProxies); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // receive loop for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize proxiesOnOtherHosts @@ -653,8 +653,7 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); @@ -675,9 +674,9 @@ class MiningGraph : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1057,15 +1056,15 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -1085,8 +1084,8 @@ class MiningGraph : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -1108,7 +1107,7 @@ class MiningGraph : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while (rb.r_size() > 0) { + while 
(rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -1134,8 +1133,8 @@ class MiningGraph : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 710ba82996..6ece9e2c51 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -625,16 +625,16 @@ class NewDistGraphGeneric : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::gSerialize(bitsetBuffer, syncNodes[h]); bytesSent += bitsetBuffer.size(); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // Step 5: recv bitset to other hosts; this indicates which local nodes each // other host needs to be informed of updates of for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize into neighbor bitsets @@ -724,7 +724,7 @@ class NewDistGraphGeneric : public DistGraph { // note the +1 on evil phase; load messages send using a different // phase to avoid conflicts - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } } sendTimer.stop(); @@ -744,13 +744,13 @@ class NewDistGraphGeneric : public DistGraph { std::vector& edgeLoads, galois::DynamicBitSet& loadsClear) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr)) p; + decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1())) p; galois::StatTimer recvTimer("Phase0AsyncRecvLoadTime", GRNAME); recvTimer.start(); do { // note the +1 - p = net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr); + p = net.recieveTagged(base_DistGraph::evilPhasePlus1()); if (p) { unsigned messageType = (unsigned)-1; @@ -945,13 +945,13 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, mastersToSend); } bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } else { // send empty no-op message, tag 0 galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, 0u); bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } sendOffsetsTimer.stop(); @@ -1020,9 +1020,9 @@ class NewDistGraphGeneric : public DistGraph { bytesSent += b.size(); // assumes phase is 0 or 1 if (phase == 1) { - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } else if (phase == 0) { - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } else { GALOIS_DIE("unexpected phase: ", phase); } @@ 
-1067,9 +1067,9 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedMasters) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1109,11 +1109,11 @@ class NewDistGraphGeneric : public DistGraph { std::unordered_map& gid2offsets, galois::DynamicBitSet& hostFinished) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; // repeat loop until no message do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); if (p) { uint32_t sendingHost = p->first; unsigned messageType = (unsigned)-1; @@ -2131,8 +2131,7 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( @@ -2156,9 +2155,9 @@ class NewDistGraphGeneric : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -2739,16 +2738,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2771,8 +2769,8 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -2885,16 +2883,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2917,8 +2914,7 @@ class NewDistGraphGeneric : public DistGraph { 
bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); } } } @@ -2940,7 +2936,7 @@ class NewDistGraphGeneric : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while (rb.r_size() > 0) { + while (rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -2966,8 +2962,8 @@ class NewDistGraphGeneric : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libdist/include/galois/runtime/Network.h b/libdist/include/galois/runtime/Network.h index e4695c0c2b..1560b20914 100644 --- a/libdist/include/galois/runtime/Network.h +++ b/libdist/include/galois/runtime/Network.h @@ -109,7 +109,7 @@ class NetworkInterface { //! tag (tag) and some data (buf) //! on the receiver, buf will be returned on a receiveTagged(tag) //! buf is invalidated by this operation - virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, + virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf, int type = 0) = 0; //! Send a message to all hosts. A message is simply a @@ -123,9 +123,6 @@ class NetworkInterface { template void broadcastSimple(void (*recv)(uint32_t, Args...), Args... param); - //! Receive and dispatch messages - void handleReceives(); - //! Wrapper to reset the mem usage tracker's stats inline void resetMemUsage() { memUsageTracker.resetMemUsage(); } @@ -134,8 +131,7 @@ class NetworkInterface { //! Receive a tagged message virtual std::optional> - recieveTagged(uint32_t tag, std::unique_lock* rlg, - int type = 0) = 0; + recieveTagged(uint32_t tag, int type = 0) = 0; //! move send buffers out to network virtual void flush() = 0; @@ -195,9 +191,6 @@ NetworkInterface& makeNetworkLCI(); //! @warning Should not be called within a parallel region; assumes only one //! thread is calling it substrate::Barrier& getHostBarrier(); -//! Returns a fence that ensures all pending messages are delivered, acting -//! like a memory-barrier -substrate::Barrier& getHostFence(); //////////////////////////////////////////////////////////////////////////////// // Implementations @@ -220,7 +213,7 @@ void NetworkInterface::sendSimple(uint32_t dest, SendBuffer buf; gSerialize(buf, (uintptr_t)recv, param..., (uintptr_t)genericLandingPad); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } template diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 94517e34ca..8110c954e9 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -49,30 +49,48 @@ namespace galois { namespace runtime { +struct BufferHeader { + enum class BufferType { kSingleMessage, kMultipleMessages, kPartialMessage }; + BufferType type{BufferType::kSingleMessage}; + uint8_t num_segments{1}; + uint8_t segment_id{0}; + uint8_t segment_tag{0}; +}; + class DeSerializeBuffer; // forward declaration for friend declaration /** * Buffer for serialization of data. Mainly used during network communication. 
*/ class SerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + //! Access to a deserialize buffer friend DeSerializeBuffer; //! type of data buffer // using vTy = std::vector; - using vTy = galois::PODResizeableArray; + using vTy = galois::PODResizeableArray; + using size_type = vTy::size_type; + //! the actual data stored in this buffer vTy bufdata; public: //! default constructor - SerializeBuffer() = default; + SerializeBuffer() { + BufferHeader header; + insert(reinterpret_cast(&header), kHeaderSize); + } + //! disabled copy constructor SerializeBuffer(SerializeBuffer&& rhs) = default; - //! Creates a buffer from another buffer - //! @param d buffer to create from - //! @param len amount of copy from buffer d - SerializeBuffer(const char* d, unsigned len) : bufdata(d, d + len) {} + + SerializeBuffer& operator=(SerializeBuffer&& rhs) { + auto buf = std::move(rhs); + bufdata = std::move(buf.get()); + return *this; + } //! Push a character onto the serialize buffer inline void push(const char c) { bufdata.push_back(c); } @@ -87,25 +105,19 @@ class SerializeBuffer { //! Insert characters from a buffer into the serialize buffer at a particular //! offset void insertAt(const uint8_t* c, size_t bytes, size_t offset) { + offset += kHeaderSize; + assert((offset + bytes) <= bufdata.size()); if (bytes > 0) { std::copy_n(c, bytes, bufdata.begin() + offset); } } - /** - * Reserve space at the end for inserting new data into the serialize - * buffer - * - * @param bytes number of bytes to reserve at the end - * @returns offset to the end of the buffer before new space was reserved - */ - size_t encomber(size_t bytes) { - size_t retval = bufdata.size(); - bufdata.resize(retval + bytes); - return retval; - } + //! Returns an iterator to the beginning of the data in this serialize buffer + vTy::const_iterator begin() const { return bufdata.cbegin(); } + //! Returns an iterator to the end of the data in this serialize buffer + vTy::const_iterator end() const { return bufdata.cend(); } - void resize(size_t bytes) { bufdata.resize(bytes); } + void resize(size_t bytes) { bufdata.resize(kHeaderSize + bytes); } /** * Reserve more space in the serialize buffer. @@ -115,34 +127,17 @@ class SerializeBuffer { void reserve(size_t s) { bufdata.reserve(bufdata.size() + s); } //! Returns a pointer to the data stored in this serialize buffer - const uint8_t* linearData() const { return bufdata.data(); } + const uint8_t* linearData() const { return bufdata.data() + kHeaderSize; } //! Returns vector of data stored in this serialize buffer - vTy& getVec() { return bufdata; } - - //! Returns an iterator to the beginning of the data in this serialize buffer - vTy::const_iterator begin() const { return bufdata.cbegin(); } - //! Returns an iterator to the end of the data in this serialize buffer - vTy::const_iterator end() const { return bufdata.cend(); } + vTy& get() { return bufdata; } - using size_type = vTy::size_type; + //! Get a pointer to the remaining data of the deserialize buffer + //! (as determined by offset) + const uint8_t* data() const { return bufdata.data() + kHeaderSize; } + uint8_t* data() { return bufdata.data() + kHeaderSize; } //! Returns the size of the serialize buffer - size_type size() const { return bufdata.size(); } - - //! Utility print function for the serialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{" << std::hex; - for (auto& i : bufdata) - o << (unsigned int)i << " "; - o << std::dec << "}>"; - } - - //! 
Operator that calls the print function of the serialize buffer - friend std::ostream& operator<<(std::ostream& os, const SerializeBuffer& b) { - b.print(os); - return os; - } + size_type size() const { return bufdata.size() - kHeaderSize; } }; /** @@ -150,50 +145,54 @@ class SerializeBuffer { * communication. */ class DeSerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); //! Access to serialize buffer friend SerializeBuffer; //! type of data buffer // using vTy = std::vector; using vTy = galois::PODResizeableArray; //! the actual data stored in this buffer - vTy bufdata; - int offset; + vTy bufdata{kHeaderSize}; + size_t offset{kHeaderSize}; public: //! Constructor initializes offset into buffer to 0 - DeSerializeBuffer() : offset(0) {} + DeSerializeBuffer() : offset(kHeaderSize) {} //! Disable copy constructor DeSerializeBuffer(DeSerializeBuffer&&) = default; //! Move constructor //! @param v vector to act as deserialize buffer //! @param start offset to start saving data into DeSerializeBuffer(vTy&& v, uint32_t start = 0) - : bufdata(std::move(v)), offset(start) {} + : bufdata(std::move(v)), offset(start + kHeaderSize) { + assert(bufdata.size() >= offset); + } //! Constructor that takes an existing vector to use as the deserialize //! buffer explicit DeSerializeBuffer(vTy& data) { bufdata.swap(data); - offset = 0; + offset = kHeaderSize; } /** * Initializes the deserialize buffer with a certain size * @param [in] count size to initialize buffer to */ - explicit DeSerializeBuffer(int count) : bufdata(count), offset(0) {} + explicit DeSerializeBuffer(int count) + : bufdata(count + kHeaderSize), offset(kHeaderSize) {} /** * Initializes the deserialize buffer using vector initialization from * 2 iterators. */ template - DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{0} {} + DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{kHeaderSize} {} /** * Initialize a deserialize buffer from a serialize buffer */ - explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(0) { + explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(kHeaderSize) { bufdata.swap(buf.bufdata); } @@ -207,31 +206,15 @@ class DeSerializeBuffer { * @param count new size of buffer */ void reset(int count) { - offset = 0; - bufdata.resize(count); - } - - //! Gets the current offset into the deserialize buffer - unsigned getOffset() const { return offset; } - //! Sets the offset into the deserialize buffer - void setOffset(unsigned off) { - assert(off <= size()); - offset = off; + offset = kHeaderSize; + bufdata.resize(count + kHeaderSize); } - //! Gets the size of the deserialize buffer - unsigned size() const { return bufdata.size(); } - - //! Returns true if the deserialize buffer is empty - //! @returns true if the deserialize buffer is empty - bool empty() const { return bufdata.empty(); } - //! Get the next character in the deserialize buffer unsigned char pop() { return bufdata.at(offset++); } - //! Clears the last x bytes of the deserialize buffer, resizing it as well - //! @param x How many bytes from the end to clear - void pop_back(unsigned x) { bufdata.resize(bufdata.size() - x); } + //! 
Gets the size of the deserialize buffer + unsigned size() const { return bufdata.size() - offset; } /** * Extracts a certain amount of data from the deserialize buffer @@ -240,6 +223,8 @@ class DeSerializeBuffer { * @param num Amount of data to get from deserialize buffer */ void extract(uint8_t* dst, size_t num) { + assert(offset >= kHeaderSize); + assert((offset + num) <= bufdata.size()); if (num > 0) { std::copy_n(&bufdata[offset], num, dst); offset += num; @@ -248,37 +233,13 @@ class DeSerializeBuffer { //! Get the underlying vector storing the data of the deserialize //! buffer - vTy& getVec() { return bufdata; } + vTy& get() { return bufdata; } //! Get a pointer to the underlying data of the deserialize buffer - void* linearData() { return &bufdata[0]; } + void* linearData() { return &bufdata[offset]; } - //! Get a pointer to the remaining data of the deserialize buffer - //! (as determined by offset) - const uint8_t* r_linearData() const { return &bufdata[offset]; } - //! Get the remaining size of the deserialize buffer (as determined - //! by offset) - size_t r_size() const { return bufdata.size() - offset; } - - //! Checks if the current location in the deserialize buffer is aligned - //! to some size a - bool atAlignment(size_t a) { return (uintptr_t)r_linearData() % a == 0; } - - //! Utility print of deserialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{(" << offset << ") " << std::hex; - for (auto ii = bufdata.begin(), ee = bufdata.end(); ii != ee; ++ii) - o << (unsigned int)*ii << " "; - o << std::dec << "}>"; - } - - //! Operator for printing deserialize buffer - friend std::ostream& operator<<(std::ostream& os, - const DeSerializeBuffer& buf) { - buf.print(os); - return os; - } + const uint8_t* data() const { return &bufdata[offset]; } + uint8_t* data() { return &bufdata[offset]; } }; namespace internal { @@ -411,7 +372,7 @@ inline size_t gSizedObj(const SerializeBuffer& data) { return data.size(); } * * @returns size of the deserialize buffer passed into it */ -inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.r_size(); } +inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.size(); } /** * Returns the size of the passed in insert bag. 
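Taken together, the SerializeBuffer and DeSerializeBuffer changes above establish one invariant: the first sizeof(BufferHeader) bytes of every buffer belong to the network layer, and size()/data() report only the user payload that follows it. A small illustrative sketch of that invariant, relying only on the members shown in this hunk:

    galois::runtime::SerializeBuffer b;   // constructor writes a default header up front
    assert(b.size() == 0);                // size() excludes the header bytes
    uint32_t value = 42;
    galois::runtime::gSerialize(b, value);
    assert(b.size() == sizeof(uint32_t)); // payload only...
    assert(b.get().size() ==              // ...while the raw vector still holds the header
           sizeof(galois::runtime::BufferHeader) + sizeof(uint32_t));

    // A DeSerializeBuffer built from it starts reading just past the header:
    galois::runtime::DeSerializeBuffer r(std::move(b));
    uint32_t out = 0;
    galois::runtime::gDeserialize(r, out);
    assert(out == 42);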
@@ -682,7 +643,7 @@ inline void gSerializeObj(SerializeBuffer& buf, * @param [in] data serialize buffer to get data from */ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { - buf.insert(data.linearData(), data.size()); + buf.insert(data.data(), data.size()); } /** @@ -693,7 +654,7 @@ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { */ inline void gSerializeObj(SerializeBuffer& buf, const DeSerializeBuffer& rbuf) { // buf.reserve(rbuf.r_size()); - buf.insert(rbuf.r_linearData(), rbuf.r_size()); + buf.insert(rbuf.data(), rbuf.size()); } /** @@ -757,8 +718,10 @@ gSerializeLazySeq(SerializeBuffer& buf, unsigned num, Seq*) { "Not POD Sequence"); typename Seq::size_type size = num; internal::gSerializeObj(buf, size); - size_t tsize = sizeof(typename Seq::value_type); - return LazyRef{buf.encomber(tsize * num)}; + size_t tsize = sizeof(typename Seq::value_type); + size_t cur_size = buf.size(); + buf.resize(cur_size + (tsize * num)); + return LazyRef{cur_size}; } /** @@ -980,18 +943,10 @@ void gDeserializeSeq(DeSerializeBuffer& buf, Seq& seq) { template void gDeserializeLinearSeq(DeSerializeBuffer& buf, Seq& seq) { typedef typename Seq::value_type T; - // seq.clear(); typename Seq::size_type size; gDeserializeObj(buf, size); - // If the alignment is right, cast to a T array and insert - if (buf.atAlignment(alignof(T))) { - T* src = (T*)buf.r_linearData(); - seq.assign(src, &src[size]); - buf.setOffset(buf.getOffset() + size * sizeof(T)); - } else { - seq.resize(size); - buf.extract((uint8_t*)seq.data(), size * sizeof(T)); - } + seq.resize(size); + buf.extract((uint8_t*)seq.data(), size * sizeof(T)); } /** @@ -1025,7 +980,7 @@ template void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { if (is_memory_copyable::value) { // manual deserialization here - size_t buffer_size; + size_t buffer_size{0}; gDeserializeObj(buf, buffer_size); bf.resize(buffer_size); buf.extract((uint8_t*)bf.get_vec_data(), buffer_size * sizeof(T)); @@ -1097,9 +1052,10 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) -> decltype( - std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) + -> decltype(std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libdist/src/DistStats.cpp b/libdist/src/DistStats.cpp index 8faf4cee5a..e8399451f3 100644 --- a/libdist/src/DistStats.cpp +++ b/libdist/src/DistStats.cpp @@ -105,8 +105,8 @@ void DistStatManager::combineAtHost_0_helper(void) { SendBuffer b; gSerialize(b, hTotalMap.region(i), hTotalMap.category(i), hTotalMap.stat(i).totalTy()); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -126,8 +126,8 @@ void DistStatManager::combineAtHost_0_helper(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -151,8 +151,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - 
getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -172,8 +172,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -182,10 +182,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { RecvBuffer& b = p->second; @@ -203,10 +203,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -230,10 +230,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -255,10 +255,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; diff --git a/libdist/src/Network.cpp b/libdist/src/Network.cpp index 44a189f7ad..7bf499a00b 100644 --- a/libdist/src/Network.cpp +++ b/libdist/src/Network.cpp @@ -93,7 +93,7 @@ void NetworkInterface::sendMsg(uint32_t dest, void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf) { gSerialize(buf, recv); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), @@ -104,30 +104,14 @@ void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), if (x != ID) { SendBuffer b; gSerialize(b, fp, buf, (uintptr_t)&bcastLandingPad); - sendTagged(x, 0, b); + sendTagged(x, 0, std::move(b)); } else if (self) { - RecvBuffer rb(buf.begin(), buf.end()); + RecvBuffer rb = RecvBuffer(std::move(buf.get())); recv(ID, rb); } } } -void NetworkInterface::handleReceives() { - std::unique_lock lg; - auto opt = recieveTagged(0, &lg); - while (opt) { - uint32_t src = std::get<0>(*opt); - RecvBuffer& buf = std::get<1>(*opt); - uintptr_t fp = 0; - gDeserializeRaw(buf.r_linearData() + buf.r_size() - sizeof(uintptr_t), fp); - buf.pop_back(sizeof(uintptr_t)); - assert(fp); - auto f = (void (*)(uint32_t, RecvBuffer&))fp; - f(src, buf); - opt = recieveTagged(0, &lg); - } -} - NetworkInterface& galois::runtime::getSystemNetworkInterface() { #ifndef 
GALOIS_USE_LCI return makeNetworkBuffered(); diff --git a/libdist/src/NetworkBuffered.cpp b/libdist/src/NetworkBuffered.cpp index 7b6d6c6ce1..a58f16c3ab 100644 --- a/libdist/src/NetworkBuffered.cpp +++ b/libdist/src/NetworkBuffered.cpp @@ -67,6 +67,12 @@ class NetworkInterfaceBuffered : public NetworkInterface { // using vTy = std::vector; using vTy = galois::PODResizeableArray; + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + static constexpr uint8_t kMaxSegmentTag = std::numeric_limits::max(); + static constexpr size_t kMaxBufferSize = + static_cast(std::numeric_limits::max()); + static constexpr size_t kMaxDataSize = kMaxBufferSize - kHeaderSize; + /** * Receive buffers for the buffered network interface */ @@ -77,6 +83,38 @@ class NetworkInterfaceBuffered : public NetworkInterface { // tag of head of queue std::atomic dataPresent; + struct PartialMessages { + uint8_t num_segments{0}; + std::vector segments; + }; + std::unordered_map partial_messages_map_; + + std::optional CombinePartialMessages(const BufferHeader& header, + vTy&& vec) { + auto& partial_messages = partial_messages_map_[header.segment_tag]; + if (partial_messages.num_segments == 0) { + partial_messages.segments.resize(header.num_segments); + } + + partial_messages.segments[header.segment_id] = std::move(vec); + ++partial_messages.num_segments; + + if (partial_messages.num_segments != header.num_segments) { + assert(partial_messages.num_segments < header.num_segments); + assert(partial_messages.segments.size() == header.num_segments); + return std::nullopt; + } + + std::vector& segments = partial_messages.segments; + vTy message = std::move(segments[0]); + for (size_t i = 1, end = segments.size(); i < end; ++i) { + message.insert(message.end(), segments[i].begin() + kHeaderSize, + segments[i].end()); + } + partial_messages_map_.erase(header.segment_tag); + return std::make_optional(std::move(message)); + } + bool sizeAtLeast(size_t n, uint32_t tag) { size_t tot = -frontOffset; for (auto& v : data) { @@ -163,30 +201,6 @@ class NetworkInterfaceBuffered : public NetworkInterface { std::optional popMsg(uint32_t tag, std::atomic& inflightRecvs) { std::lock_guard lg(qlock); -#ifndef NO_AGG - uint32_t len = getLenFromFront(tag); - // assert(len); - if (len == ~0U || len == 0) - return std::optional(); - if (!sizeAtLeast(sizeof(uint32_t) + len, tag)) - return std::optional(); - erase(4, inflightRecvs); - - // Try just using the buffer - if (auto r = popVec(len, inflightRecvs)) { - auto start = r->size() - len; - // std::cerr << "FP " << r->size() << " " << len << " " << start - // << "\n"; - return std::optional(RecvBuffer(std::move(*r), start)); - } - - RecvBuffer buf(len); - // FIXME: This is slows things down 25% - copyOut((char*)buf.linearData(), len); - erase(len, inflightRecvs); - // std::cerr << "p " << tag << " " << len << "\n"; - return std::optional(std::move(buf)); -#else if (data.empty() || data.front().tag != tag) return std::optional(); @@ -201,31 +215,28 @@ class NetworkInterfaceBuffered : public NetworkInterface { } return std::optional(RecvBuffer(std::move(vec), 0)); -#endif } // Worker thread interface - void add(NetworkIO::message m) { + bool add(NetworkIO::message m) { + BufferHeader* header = reinterpret_cast(m.data.data()); + if (header->type == BufferHeader::BufferType::kPartialMessage) { + std::optional segment = + CombinePartialMessages(*header, std::move(m.data)); + if (!segment) { + return false; + } + + m.data = std::move(*segment); + } std::lock_guard lg(qlock); if (data.empty()) { 
galois::runtime::trace("ADD LATEST ", m.tag); dataPresent = m.tag; } - // std::cerr << m.data.size() << " " << - // std::count(m.data.begin(), m.data.end(), 0) << "\n"; - // for (auto x : m.data) { - // std::cerr << (int) x << " "; - // } - // std::cerr << "\n"; - // std::cerr << "A " << m.host << " " << m.tag << " " << m.data.size() << - // "\n"; - data.push_back(std::move(m)); - - assert(data.back().data.size() != - (unsigned int)std::count(data.back().data.begin(), - data.back().data.end(), 0)); + return true; } bool hasData(uint32_t tag) { return dataPresent == tag; } @@ -245,7 +256,7 @@ class NetworkInterfaceBuffered : public NetworkInterface { struct msg { uint32_t tag; vTy data; - msg(uint32_t t, vTy& _data) : tag(t), data(std::move(_data)) {} + msg(uint32_t t, vTy&& _data) : tag(t), data(std::move(_data)) {} }; std::deque messages; @@ -254,6 +265,43 @@ class NetworkInterfaceBuffered : public NetworkInterface { //! @todo FIXME track time since some epoch in an atomic. std::chrono::high_resolution_clock::time_point time; SimpleLock lock, timelock; + uint8_t segment_tag_{0}; + + void IncrementSegmentTag() { + if (segment_tag_ == kMaxSegmentTag) { + segment_tag_ = 0; + } else { + ++segment_tag_; + } + } + + std::vector Split(uint32_t host, uint32_t tag, + vTy&& vec) { + std::vector segments; + segments.emplace_back(std::move(vec)); + auto begin = segments[0].begin(); + for (size_t i = kMaxBufferSize, end = segments[0].size(); i < end; + i += kMaxDataSize) { + vTy segment(kHeaderSize); + size_t segment_end = std::min(end, i + kMaxDataSize); + segment.insert(segment.end(), begin + i, begin + segment_end); + segments.emplace_back(std::move(segment)); + } + segments[0].resize(kMaxBufferSize); + + std::vector msg; + for (size_t i = 0; i < segments.size(); ++i) { + auto& segment = segments[i]; + BufferHeader* header = reinterpret_cast(segment.data()); + header->type = BufferHeader::BufferType::kPartialMessage; + header->num_segments = segments.size(); + header->segment_id = i; + header->segment_tag = segment_tag_; + msg.emplace_back(host, tag, std::move(segment)); + } + IncrementSegmentTag(); + return msg; + } public: unsigned long statSendTimeout; @@ -269,103 +317,35 @@ class NetworkInterfaceBuffered : public NetworkInterface { } } - bool ready() { -#ifndef NO_AGG - if (numBytes == 0) - return false; - if (urgent) { - ++statSendUrgent; - return true; - } - if (numBytes > COMM_MIN) { - ++statSendOverflow; - return true; - } - auto n = std::chrono::high_resolution_clock::now(); - decltype(n) mytime; - { - std::lock_guard lg(timelock); - mytime = time; - } - auto elapsed = - std::chrono::duration_cast(n - mytime); - if (elapsed.count() > COMM_DELAY) { - ++statSendTimeout; - return true; - } - return false; -#else - return messages.size() > 0; -#endif - } + bool ready() { return messages.size() > 0; } - std::pair - assemble(std::atomic& GALOIS_UNUSED(inflightSends)) { + std::vector assemble(uint32_t host) { std::unique_lock lg(lock); - if (messages.empty()) - return std::make_pair(~0, vTy()); -#ifndef NO_AGG - // compute message size - uint32_t len = 0; - int num = 0; - uint32_t tag = messages.front().tag; - for (auto& m : messages) { - if (m.tag != tag) { - break; - } else { - // do not let it go over the integer limit because MPI_Isend cannot - // deal with it - if ((m.data.size() + sizeof(uint32_t) + len + num) > - static_cast(std::numeric_limits::max())) { - break; - } - len += m.data.size(); - num += sizeof(uint32_t); - } - } - lg.unlock(); - // construct message - vTy vec; - 
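Split() exists because a single wire message is capped at kMaxBufferSize bytes (the same 32-bit limit the old aggregation path worked around), so an oversized serialized buffer is cut into segments that CombinePartialMessages() reassembles on the receiver. The arithmetic is easier to see in isolation; a minimal sketch, assuming the kHeaderSize/kMaxBufferSize/kMaxDataSize constants defined earlier in this file (NumSegments is a name made up for the sketch):

    // Number of wire messages Split() produces for a serialized buffer of
    // `total` bytes (header included). Segment 0 keeps the original header and
    // the first kMaxBufferSize bytes; every later segment carries a fresh
    // kHeaderSize header plus at most kMaxDataSize payload bytes.
    size_t NumSegments(size_t total) {
      if (total <= kMaxBufferSize)
        return 1;
      size_t overflow = total - kMaxBufferSize;
      return 1 + (overflow + kMaxDataSize - 1) / kMaxDataSize; // ceiling division
    }

    // On the receive side, CombinePartialMessages() appends segments 1..n-1
    // with their headers stripped, so the reassembled size is
    //   kMaxBufferSize + sum_i (segment_i.size() - kHeaderSize) == total.
    // Since BufferHeader::num_segments is a uint8_t, one logical message can
    // span at most 255 segments under this scheme.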
vec.reserve(len + num); - // go out of our way to avoid locking out senders when making messages - lg.lock(); - do { - auto& m = messages.front(); - lg.unlock(); - union { - uint32_t a; - uint8_t b[sizeof(uint32_t)]; - } foo; - foo.a = m.data.size(); - vec.insert(vec.end(), &foo.b[0], &foo.b[sizeof(uint32_t)]); - vec.insert(vec.end(), m.data.begin(), m.data.end()); - if (urgent) - --urgent; - lg.lock(); - messages.pop_front(); - --inflightSends; - } while (vec.size() < len + num); - ++inflightSends; - numBytes -= len; -#else + assert(!messages.empty()); uint32_t tag = messages.front().tag; vTy vec(std::move(messages.front().data)); messages.pop_front(); -#endif - return std::make_pair(tag, std::move(vec)); + + if (vec.size() > kMaxBufferSize) { + return Split(host, tag, std::move(vec)); + } + + BufferHeader* header = reinterpret_cast(vec.data()); + header->type = BufferHeader::BufferType::kSingleMessage; + std::vector msgs; + msgs.emplace_back(host, tag, std::move(vec)); + return msgs; } - void add(uint32_t tag, vTy& b) { + void add(uint32_t tag, vTy&& b) { std::lock_guard lg(lock); if (messages.empty()) { std::lock_guard lg(timelock); time = std::chrono::high_resolution_clock::now(); } - unsigned oldNumBytes = numBytes; + assert(b.size() >= kHeaderSize); numBytes += b.size(); - galois::runtime::trace("BufferedAdd", oldNumBytes, numBytes, tag, - galois::runtime::printVec(b)); - messages.emplace_back(tag, b); + messages.emplace_back(tag, std::move(b)); } }; // end send buffer class @@ -402,24 +382,26 @@ class NetworkInterfaceBuffered : public NetworkInterface { // handle send queue i auto& sd = sendData[i]; if (sd.ready()) { - NetworkIO::message msg; - msg.host = i; - std::tie(msg.tag, msg.data) = sd.assemble(inflightSends); - galois::runtime::trace("BufferedSending", msg.host, msg.tag, - galois::runtime::printVec(msg.data)); - ++statSendEnqueued; - netio->enqueue(std::move(msg)); + std::vector msgs = sd.assemble(i); + if (msgs.size() > 1) { + inflightSends += msgs.size() - 1; + } + + for (auto& msg : msgs) { + ++statSendEnqueued; + netio->enqueue(std::move(msg)); + } } + // handle receive NetworkIO::message rdata = netio->dequeue(); if (rdata.data.size()) { ++statRecvDequeued; - assert(rdata.data.size() != - (unsigned int)std::count(rdata.data.begin(), rdata.data.end(), - 0)); - galois::runtime::trace("BufferedRecieving", rdata.host, rdata.tag, - galois::runtime::printVec(rdata.data)); - recvData[rdata.host].add(std::move(rdata)); + uint32_t h = rdata.host; + bool not_partial_segment = recvData[h].add(std::move(rdata)); + if (!not_partial_segment) { + --inflightRecvs; + } } } } @@ -454,22 +436,19 @@ class NetworkInterfaceBuffered : public NetworkInterface { std::unique_ptr netio; - virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, + virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf, int phase) { - ++inflightSends; tag += phase; statSendNum += 1; - statSendBytes += buf.size(); - galois::runtime::trace("sendTagged", dest, tag, - galois::runtime::printVec(buf.getVec())); + statSendBytes += buf.size() + kHeaderSize; + memUsageTracker.incrementMemUsage(buf.size() + kHeaderSize); + ++inflightSends; auto& sd = sendData[dest]; - sd.add(tag, buf.getVec()); + sd.add(tag, std::move(buf.get())); } virtual std::optional> - recieveTagged(uint32_t tag, - std::unique_lock* rlg, - int phase) { + recieveTagged(uint32_t tag, int phase) { tag += phase; for (unsigned h = 0; h < recvData.size(); ++h) { auto& rq = recvData[h]; @@ -480,12 +459,8 @@ class 
NetworkInterfaceBuffered : public NetworkInterface { auto buf = rq.popMsg(tag, inflightRecvs); if (buf) { ++statRecvNum; - statRecvBytes += buf->size(); - memUsageTracker.decrementMemUsage(buf->size()); - if (rlg) - *rlg = std::move(lg); - galois::runtime::trace("recvTagged", h, tag, - galois::runtime::printVec(buf->getVec())); + statRecvBytes += buf->size() + kHeaderSize; + memUsageTracker.decrementMemUsage(buf->size() + kHeaderSize); anyReceivedMessages = true; return std::optional>( std::make_pair(h, std::move(*buf))); diff --git a/libdist/src/NetworkLCI.cpp b/libdist/src/NetworkLCI.cpp index 59b17a1d35..3770356c8c 100644 --- a/libdist/src/NetworkLCI.cpp +++ b/libdist/src/NetworkLCI.cpp @@ -182,8 +182,8 @@ class NetworkInterfaceLCI : public NetworkInterface { statSendBytes += buf.size(); // int count = 0; #ifndef GALOIS_SUPPORT_ASYNC - if (buf.getVec().size() < 8192) { - while (lc_sendm(buf.getVec().data(), buf.getVec().size(), dest, tag, + if (buf.get().size() < 8192) { + while (lc_sendm(buf.get().data(), buf.get().size(), dest, tag, lc_p2p_ep[phase]) != LC_OK) { sched_yield(); } @@ -191,7 +191,7 @@ class NetworkInterfaceLCI : public NetworkInterface { #endif { pendingReq* msg = - new pendingReq(dest, tag, phase, buf.getVec(), inflightSends); + new pendingReq(dest, tag, phase, buf.get(), inflightSends); while (lc_sendl(msg->buf.data(), msg->buf.size(), dest, tag, lc_p2p_ep[phase], free_req, msg) != LC_OK) { sched_yield(); diff --git a/libgluon/include/galois/graphs/GluonEdgeSubstrate.h b/libgluon/include/galois/graphs/GluonEdgeSubstrate.h index 7e39a5b7c0..7342c9a57e 100644 --- a/libgluon/include/galois/graphs/GluonEdgeSubstrate.h +++ b/libgluon/include/galois/graphs/GluonEdgeSubstrate.h @@ -133,7 +133,7 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, mirrorEdges[x]); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // receive the mirror edges @@ -141,9 +141,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); galois::runtime::gDeserialize(p->second, masterEdges[p->first]); @@ -169,7 +169,7 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, totalMirrorEdges, totalOwnedEdges); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // receive @@ -177,9 +177,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint64_t totalMirrorFromOther; @@ -1097,9 +1097,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { template inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data()); + return FnTy::extract_reset_batch(x, b.data()); } else { - return FnTy::extract_batch(x, b.getVec().data()); + return 
FnTy::extract_batch(x, b.data()); } } @@ -1125,9 +1125,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b, size_t& s, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_reset_batch(x, b.data(), &s, &data_mode); } else { - return FnTy::extract_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_batch(x, b.data(), &s, &data_mode); } } @@ -1243,12 +1243,12 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { template inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_batch(x, b.data()); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_mirror_batch(x, b.data()); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::setVal_batch(x, b.data()); } } } @@ -1273,15 +1273,12 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_batch(x, b.data(), data_mode); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_mirror_batch(x, b.data(), data_mode); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::setVal_batch(x, b.data(), data_mode); } } } @@ -1723,7 +1720,8 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (async && (syncType == syncBroadcast)) syncTypePhase = 1; - net.sendTagged(x, galois::runtime::evilPhase, b, syncTypePhase); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b), + syncTypePhase); ++numMessages; } } @@ -1958,11 +1956,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (syncType == syncBroadcast) syncTypePhase = 1; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase, syncTypePhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase); + p = net.recieveTagged(galois::runtime::evilPhase, syncTypePhase); if (p) { syncRecvApply( @@ -1977,9 +1973,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { continue; Twait.start(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); Twait.stop(); diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 9e7a7738a4..7a1e5b6665 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -190,7 +190,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, mirrorNodes[x]); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // 
receive the mirror nodes @@ -198,9 +198,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); galois::runtime::gDeserialize(p->second, masterNodes[p->first]); @@ -226,7 +226,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, global_total_mirror_nodes, global_total_owned_nodes); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // receive @@ -234,9 +234,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint64_t total_mirror_nodes_from_others; @@ -1348,9 +1348,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data()); + return FnTy::extract_reset_batch(x, b.data()); } else { - return FnTy::extract_batch(x, b.getVec().data()); + return FnTy::extract_batch(x, b.data()); } } @@ -1376,9 +1376,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b, size_t& s, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_reset_batch(x, b.data(), &s, &data_mode); } else { - return FnTy::extract_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_batch(x, b.data(), &s, &data_mode); } } @@ -1602,12 +1602,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_batch(x, b.data()); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_mirror_batch(x, b.data()); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::setVal_batch(x, b.data()); } } } @@ -1632,15 +1632,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_batch(x, b.data(), data_mode); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_mirror_batch(x, b.data(), data_mode); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::setVal_batch(x, b.data(), data_mode); } } } @@ -2223,7 +2220,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { MPI_Wait(&request[x], MPI_STATUS_IGNORE); } if (b[x].size() > 0) { - b[x].getVec().clear(); + b[x].get().clear(); } 
getSendBuffer(loopName, x, @@ -2325,7 +2322,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (async && (syncType == syncBroadcast)) syncTypePhase = 1; - net.sendTagged(x, galois::runtime::evilPhase, b, syncTypePhase); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b), + syncTypePhase); ++numMessages; } } @@ -2806,11 +2804,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (syncType == syncBroadcast) syncTypePhase = 1; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase, syncTypePhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase); + p = net.recieveTagged(galois::runtime::evilPhase, syncTypePhase); if (p) { syncRecvApply( @@ -2825,9 +2821,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { continue; Twait.start(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); Twait.stop(); From 79701576db20348e01827901a25d7a5ac638ebf1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Apr 2021 12:34:15 -0500 Subject: [PATCH 518/660] Inductive/sampling for training now co-exist Can use inductive and sampling (separately) on single host without any issues. There are some remaining issues that need to be resolved like using the correct degree for sampling (requires dist comms for degrees which will be implemented next). --- .../include/galois/graphs/LC_CSR_Graph.h | 8 +++ libgnn/include/galois/GraphNeuralNetwork.h | 2 + libgnn/include/galois/graphs/GNNGraph.h | 29 +++++--- libgnn/include/galois/graphs/GNNSubgraph.h | 2 + libgnn/include/galois/layers/GNNLayer.h | 6 ++ libgnn/src/GraphNeuralNetwork.cpp | 10 ++- libgnn/src/graphs/GNNGraph.cpp | 69 +++++++++++++++---- libgnn/src/graphs/GNNSubgraph.cpp | 3 + libgnn/src/layers/SAGELayer.cpp | 8 ++- 9 files changed, 106 insertions(+), 31 deletions(-) diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 9f849d0efc..45d39fafaa 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -606,6 +606,14 @@ class LC_CSR_Graph : edgeData.destroy(); } + //! No destroy, only deallocate + void DeallocateOnly() { + nodeData.deallocate(); + edgeIndData.deallocate(); + edgeDst.deallocate(); + edgeData.deallocate(); + } + void constructEdge(EdgeIndexTy e, NodeIndexTy dst, const typename EdgeData::value_type& val) { edgeData.set(e, val); diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 3df6fbe94e..953e925d9a 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -198,6 +198,8 @@ class GraphNeuralNetwork { std::vector> gnn_layers_; //! Current phase of the GNN: train, validation, test GNNPhase phase_{GNNPhase::kTrain}; + //! Number of layers that use the graph (e.g. SAGE, GCN) + size_t num_graph_user_layers_; #ifdef GALOIS_ENABLE_GPU //! 
Holds all GPU functions diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3a538d9da5..7e0d016e06 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -115,9 +115,9 @@ class GNNGraph { // Edges ////////////////////////////////////////////////////////////////////////////// - void InitializeSamplingData() { InitializeSamplingData(1); } + void InitializeSamplingData() { InitializeSamplingData(1, false); } //! Initialize data required to do graph sampling - void InitializeSamplingData(size_t num_layers); + void InitializeSamplingData(size_t num_layers, bool is_inductive); ////////////////////////////////////////////////////////////////////////////// // Out Edges @@ -286,16 +286,23 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } + //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat GetDegreeNorm(GraphNode n) const { - if (!use_subgraph_) { - return degree_norm_[n]; + GNNFloat GetGlobalDegreeNorm(GraphNode n) const { return degree_norm_[n]; } + + //! Get degree of subgraph for particular layer + GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { + if (use_subgraph_) { + if (!subgraph_is_inductive_) { + // case because degrees in each layer differ + return 1.0 / sampled_out_degrees_[graph_user_layer_num] + [subgraph_->SIDToLID(n)]; + } else { + return 1.0 / sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + } } else { - // XXX does not work in distributed case, fix there - // XXX also need to account for current layer number in sampling - // case because degrees in each layer differ - return 1.0 / subgraph_->GetLocalDegree(n); + return degree_norm_[n]; } } @@ -521,8 +528,7 @@ class GNNGraph { std::unique_ptr subgraph_; // Degrees for sampled subgraph - galois::LargeArray sampled_out_degrees_; - galois::LargeArray sampled_in_degrees_; + std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; @@ -568,6 +574,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; + bool subgraph_is_inductive_{false}; ////////////////////////////////////////////////////////////////////////////// // GPU things diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index c3c931f0da..4ac7c739eb 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -104,6 +104,8 @@ class GNNSubgraph { static const constexpr char* kRegionName = "GNNSubgraph"; + bool inductive_subgraph_{false}; + // name is self explanatory LC_CSR_CSC_Graph underlying_graph_; // size vars diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 82b149ee5e..5cfe69b83e 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -142,6 +142,7 @@ class GNNLayer { return output_layer_type_; } size_t layer_number() const { return layer_number_; } + size_t graph_user_layer_number() const { return graph_user_layer_number_; } //! Conducts the forward phase given the input to this layer which //! 
ultimately leads to an output (classfication of node labels) at the end @@ -175,6 +176,9 @@ class GNNLayer { void EnableSampling() { config_.do_sampling = true; } bool IsSampledLayer() const { return config_.do_sampling; } bool IsInductiveLayer() const { return config_.inductive_training_; } + //! Sets the graph user layer number; important for sampling as this index + //! determines which index to use when checking for sampled edges + void SetGraphUserLayerNumber(size_t num) { graph_user_layer_number_ = num; } #ifdef GALOIS_ENABLE_GPU //! Utility function for allocating @@ -207,6 +211,8 @@ class GNNLayer { //! 0 does not need to do some things that other layers need to do // XXX be more specific size_t layer_number_; + //! Graph layer number: only layers that use the graph are numbered + size_t graph_user_layer_number_; //! Pointer to the graph being trained by this layer. //! This is owned by the creator of this layer, so no need to free it when //! this layer is destroyed. diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 1b492c34ec..46b8a6bcdd 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -28,6 +28,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( #endif // used for chaining layers together; begins as nullptr PointerWithSize prev_output_layer(nullptr, 0); + num_graph_user_layers_ = 0; // create the intermediate layers for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { @@ -52,6 +53,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); + gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { graph_->InitLayerVectorMetaObjects( @@ -64,6 +66,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); + gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) sage layer gpu #endif @@ -105,7 +108,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } if (config_.do_sampling() || config_.inductive_training_) { // output layer not included; it will never involve sampling - graph_->InitializeSamplingData(gnn_layers_.size()); + graph_->InitializeSamplingData(num_graph_user_layers_, + config_.inductive_training_); } // create the output layer @@ -160,7 +164,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleAllEdges((*back_iter)->layer_number()); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); } } // resize layer matrices @@ -197,7 +201,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->layer_number(), 30); + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 5); num_sampled_layers++; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 56572ccb76..80ea988166 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -857,11 +857,26 @@ float 
galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( //////////////////////////////////////////////////////////////////////////////// -void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers) { +void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, + bool is_inductive) { subgraph_ = std::make_unique(partitioned_graph_->size()); - edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); - sampled_out_degrees_.create(partitioned_graph_->size(), 0); - sampled_in_degrees_.create(partitioned_graph_->size(), 0); + edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); + // this is to hold the *global* degree of a sampled graph; yes, memory wise + // this is slightly problematic possibly, but each layer is its own + // subgraph + if (!is_inductive) { + sampled_out_degrees_.resize(num_layers); + for (galois::LargeArray& array : sampled_out_degrees_) { + array.create(partitioned_graph_->size()); + } + } else { + // TODO(loc) optimize possible: inductive setting means # nodes always + // only training/other nodes, so can allocate only what is required + // Allocating full size is inefficient + sampled_out_degrees_.resize(1); + sampled_out_degrees_[0].create(partitioned_graph_->size()); + subgraph_is_inductive_ = true; + } } void galois::graphs::GNNGraph::SetupNeighborhoodSample() { @@ -882,9 +897,17 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { std::fill(edge_sample_status_[edge_id].begin(), edge_sample_status_[edge_id].end(), 0); }); + // reset all degrees + galois::do_all( + galois::iterate(sampled_out_degrees_), + [&](galois::LargeArray& array) { + std::fill(array.begin(), array.end(), 0); + }, + galois::chunk_size<1>()); } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { + assert(subgraph_is_inductive_); use_subgraph_ = false; galois::GAccumulator sampled; @@ -894,18 +917,27 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { galois::do_all( galois::iterate(begin(), end()), - [&](const NodeIterator& x) { + [&](const NodeIterator& src_iter) { // only operate on if sampled - if (partitioned_graph_->getData(*x)) { + if (partitioned_graph_->getData(*src_iter)) { // marks ALL edges of nodes that connect to train/other nodes - for (auto edge_iter : partitioned_graph_->edges(*x)) { + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), GNNPhase::kTrain) || IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), GNNPhase::kOther)) { MakeEdgeSampled(edge_iter, agg_layer_num); - new_sampled_nodes_.set(partitioned_graph_->getEdgeDst(edge_iter)); + if (!IsInSampledGraph( + partitioned_graph_->getEdgeDst(edge_iter))) { + new_sampled_nodes_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } sampled += 1; + // only count once for last layer (last layer is where all + // relevant nodes will be included) + if (agg_layer_num == 0) { + sampled_out_degrees_[0][*src_iter]++; + } } total += 1; } @@ -930,6 +962,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample) { + assert(!subgraph_is_inductive_); use_subgraph_ = false; galois::GAccumulator sampled; @@ -938,15 +971,16 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, total.reset(); galois::do_all( galois::iterate(begin(), end()), - [&](const NodeIterator& x) { + [&](const NodeIterator& src_iter) { // only operate on if sampled - 
if (partitioned_graph_->getData(*x)) { + if (partitioned_graph_->getData(*src_iter)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) + // XXX training degree + other norm, not global double probability_of_reject = - std::pow(1 - GetDegreeNorm(*x), num_to_sample); + std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); // loop through edges, turn "on" edge with some probability - for (auto edge_iter : partitioned_graph_->edges(*x)) { + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (sample_rng_.DoBernoulli(probability_of_reject)) { // only take if node is training node or a node not classified // into train/test/val @@ -957,13 +991,20 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // if here, it means edge accepted; set sampled on, mark source // as part of next set MakeEdgeSampled(edge_iter, sample_layer_num); - new_sampled_nodes_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + if (!IsInSampledGraph( + partitioned_graph_->getEdgeDst(edge_iter))) { + new_sampled_nodes_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } + // degree increment + sampled_out_degrees_[sample_layer_num][*src_iter]++; sampled += 1; } } total += 1; } + // galois::gDebug(*src_iter, " with degree ", + // sampled_out_degrees_[sample_layer_num][*src_iter]); } }, galois::steal(), galois::loopname("NeighborhoodSample")); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index e80dfffbc9..cfacf02f4f 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -58,6 +58,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( timer.stop(); } +// TODO optimize further? void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( const GNNGraph& gnn_graph) { galois::StatTimer timer("DegreeCounting", kRegionName); @@ -98,6 +99,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( timer.stop(); } +// TODO optimize further? 
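
The NeighborhoodSample loop above keeps an edge based on a Bernoulli draw derived from probability_of_reject = (1 - 1/degree)^num_to_sample, i.e. the chance that one fixed edge is never chosen in num_to_sample independent uniform picks from that node's edges. The standalone sketch below reproduces only that arithmetic; std::mt19937 and std::bernoulli_distribution stand in for galois::PerThreadRNG, whose DoBernoulli semantics are not shown in this patch.

// Illustrative sketch of the sampling arithmetic only.
#include <cmath>
#include <iostream>
#include <random>

int main() {
  const double degree        = 20.0; // hypothetical out-degree of one node
  const int    num_to_sample = 5;    // same count the training loop passes

  double degree_norm = 1.0 / degree;
  // Chance one fixed edge is never picked in num_to_sample uniform draws.
  double probability_of_reject = std::pow(1.0 - degree_norm, num_to_sample);
  // Chance it is picked at least once.
  double probability_of_keep = 1.0 - probability_of_reject;
  std::cout << "reject " << probability_of_reject << ", keep "
            << probability_of_keep << "\n";

  // One keep/reject decision per edge of this node.
  std::mt19937 rng(42);
  std::bernoulli_distribution keep_edge(probability_of_keep);
  int kept = 0;
  for (int e = 0; e < static_cast<int>(degree); ++e) {
    if (keep_edge(rng)) {
      ++kept; // the real code would mark the edge sampled for this layer
    }
  }
  std::cout << "kept " << kept << " of " << static_cast<int>(degree)
            << " edges\n";
  return 0;
}
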
void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); @@ -111,6 +113,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( // allocate then set node endpoints num_subgraph_edges_ = subgraph_out_degrees_.back(); + underlying_graph_.DeallocateOnly(); underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index dfae86cbd2..1240280b48 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -415,7 +415,7 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.GetDegreeNorm(src); + source_norm = graph_.GetDegreeNorm(src, graph_user_layer_number_); } if (!is_backward) { @@ -447,7 +447,8 @@ void galois::SAGELayer::AggregateAllCPU( if (!is_backward) { norm_scale = source_norm; } else { - norm_scale = graph_.GetDegreeNorm(dst); + norm_scale = + graph_.GetDegreeNorm(dst, graph_user_layer_number_); } galois::VectorMulAdd( @@ -486,7 +487,8 @@ void galois::SAGELayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale = graph_.GetDegreeNorm(dst); + GNNFloat norm_scale = + graph_.GetDegreeNorm(dst, graph_user_layer_number_); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], From d001e004f3c77a4b5d68156b6c0c234d01dc5bd1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Apr 2021 17:29:06 -0500 Subject: [PATCH 519/660] SAGE sync fix + bitset fix for GNNs 1) SAGE layer iterates over in-edges for backward phase, meaning you write destination. Sync call, therefore, needs to be write destination for the backward phase. 2) Non-full bitsets were not compatible with GNN applications because the manyvec to single vec hack did not account for size changes from the bitset. This commit fixes that. --- .../include/galois/graphs/GluonSubstrate.h | 72 +++++++++++++++---- libgnn/include/galois/graphs/GNNGraph.h | 11 ++- libgnn/src/graphs/GNNGraph.cpp | 16 +++-- libgnn/src/layers/SAGELayer.cpp | 33 ++------- 4 files changed, 85 insertions(+), 47 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 7a1e5b6665..f102e3a4a1 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -841,6 +841,48 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // only difference is val_vec doesn't get resized ever (it's the single array + // from the hack call + template + void + serializeMessageVecHack(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, VecType& val_vec, + galois::runtime::SendBuffer& b) { + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tserialize( + serialize_timer_str.c_str(), RNAME); + if (data_mode == noData) { + if (!async) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } + } else if (data_mode == gidsData) { + offsets.resize(bit_set_count); + convertLIDToGID(loopName, indices, offsets); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, val_vec); + Tserialize.stop(); + } else if (data_mode == offsetsData) { + offsets.resize(bit_set_count); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, val_vec); + Tserialize.stop(); + } else if (data_mode == bitsetData) { + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, bit_set_comm, val_vec); + Tserialize.stop(); + } else { // onlyData + Tserialize.start(); + gSerialize(b, data_mode, val_vec); + Tserialize.stop(); + } + } + /** * Given the data mode, deserialize the rest of a message in a Receive Buffer. * @@ -2030,20 +2072,24 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Vector of vectors is in val_vec // val vec over to contiguous array of #s - size_t num_nodes = val_vec.size(); - size_t feature_size = val_vec[0].size(); - single_array.resize(num_nodes * feature_size); - galois::do_all( - galois::iterate(size_t{0}, num_nodes), - [&](size_t node) { - std::memcpy(&(single_array.data()[node * feature_size]), - val_vec[node].data(), feature_size * sizeof(float)); - }, - galois::loopname("GluonSerializeManyVecToOne")); + size_t num_nodes = bit_set_count; + size_t feature_size = 0; + if (bit_set_count != 0) { + feature_size = val_vec[0].size(); + single_array.resize(num_nodes * feature_size); + galois::do_all( + galois::iterate(size_t{0}, num_nodes), + [&](size_t index) { + std::memcpy(&(single_array.data()[index * feature_size]), + val_vec[index].data(), + feature_size * sizeof(float)); + }, + galois::loopname("GluonSerializeManyVecToOne")); + } - serializeMessage(loopName, data_mode, bit_set_count, - indices, offsets, bit_set_comm, - single_array, b); + serializeMessageVecHack( + loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, + single_array, b); gSerialize(b, feature_size); } else { // TODO(loc/hochan) vector gpu hack for gnns diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 7e0d016e06..01840c39fc 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -377,13 +377,20 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// + // TODO(loc) Should not be a default version of this to avoid potential + // issues later + void AggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size) const { + AggregateSync(matrix_to_sync, matrix_column_size, false); + }; + //! Given a matrix and the column size, do an aggregate sync where each row //! is considered a node's data and sync using the graph's Gluon //! substrate //! Note that it's const because the only thing being used is the graph //! 
topology of this object; the thing modified is the passed in matrix - void AggregateSync(GNNFloat* matrix_to_sync, - const size_t matrix_column_size) const; + void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, + bool is_backward) const; ////////////////////////////////////////////////////////////////////////////// // Sampling related diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 80ea988166..861a982a98 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -173,14 +173,20 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( return (*mask_to_use)[lid]; } -void galois::graphs::GNNGraph::AggregateSync( - GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { +void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size, + bool is_backward) const { // set globals for the sync substrate gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - sync_substrate_ - ->sync( - "GraphAggregateSync"); + if (!is_backward) { + sync_substrate_ + ->sync( + "GraphAggregateSync"); + } else { + sync_substrate_->sync("BackwardGraphAggregateSync"); + } } #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 1240280b48..48ab1e0b4e 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -394,25 +394,12 @@ void galois::SAGELayer::AggregateAllCPU( galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](size_t src) { - // TODO(loc) this is currently a hack: the sync substrate blows - // up if not the entire bitset is set for sync call like in - // edge sampling - graphs::bitset_graph_aggregate.set(src); size_t index_to_src_feature = src * column_length; // zero out src feature first for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = 0; } - // if (layer_phase_ == GNNPhase::kTrain) { - // // XXX - // if (IsInductiveLayer()) { - // // if inductive, all non-training nodes do not exist - // if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) - // return; - // } - //} - GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.GetDegreeNorm(src, graph_user_layer_number_); @@ -422,17 +409,13 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - // graphs::bitset_graph_aggregate.set(src); + // XXX set LID + graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.GetEdgeDest(e); // galois::gPrint("(", src, " ", dst, ")\n"); if (layer_phase_ == GNNPhase::kTrain) { - //// XXX - // if (IsInductiveLayer()) { - // // if inductive, all non-training nodes do not exist - // if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - // return; - //} + // XXX if (IsSampledLayer()) { if (!graph_.IsEdgeSampled(e, layer_number_)) { continue; @@ -467,16 +450,12 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); e++) { - // graphs::bitset_graph_aggregate.set(src); + // XXX LID not SID + graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.GetInEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { // XXX - // if (IsInductiveLayer()) { - // // if inductive, all non-training nodes do not exist - // if (!graph_.IsValidForPhase(dst, 
GNNPhase::kTrain)) - // return; - //} if (IsSampledLayer()) { if (!graph_.IsInEdgeSampled(e, layer_number_)) { continue; @@ -507,7 +486,7 @@ void galois::SAGELayer::AggregateAllCPU( galois::chunk_size<1>(), galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync - graph_.AggregateSync(aggregate_output, column_length); + graph_.AggregateSync(aggregate_output, column_length, is_backward); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, From d7d2e3dd7cd446c5fa559cc7c8094bfd091b9b88 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Apr 2021 17:40:47 -0500 Subject: [PATCH 520/660] sample-bit-test fix: call update --- libgnn/test/sample-bit-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index 89ed60d0ad..f603578c13 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -15,7 +15,7 @@ int main() { galois::graphs::GNNGraph graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - graph.InitializeSamplingData(3); + graph.InitializeSamplingData(3, false); // first, assert all edges are not sampled (should come with all 0s) for (size_t node = 0; node < graph.size(); node++) { From 6a7e09d63fc180332d92bd6be719f52031beb8e9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 29 Apr 2021 15:29:31 -0500 Subject: [PATCH 521/660] Distributed calculation of gnn node degrees Before this commit, degrees were calculated on every host by loading in full original graph and using it. That is not efficient: this commit makes every host count degrees and send it to all other hosts. The reading of the whole graph topology has been removed. Removes a bunch of older unused functions like SpecialNormFactor and such as well. --- .../galois/graphs/DegreeSyncStructures.h | 58 +++++++++ libgnn/include/galois/graphs/GNNGraph.h | 49 +++++--- .../graphs/GraphAggregationSyncStructures.h | 1 - libgnn/src/graphs/GNNGraph.cpp | 112 ++++-------------- libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 +- 5 files changed, 114 insertions(+), 110 deletions(-) create mode 100644 libgnn/include/galois/graphs/DegreeSyncStructures.h diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h new file mode 100644 index 0000000000..0141805df0 --- /dev/null +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -0,0 +1,58 @@ +#include "galois/GNNTypes.h" + +namespace galois { +namespace graphs { + +extern uint32_t* gnn_degree_vec_1_; +extern uint32_t* gnn_degree_vec_2_; + +struct InitialDegreeSync { + using ValTy = std::pair; + + //! return a vector of floats to sync + static ValTy extract(uint32_t lid, char&) { + return std::make_pair(gnn_degree_vec_1_[lid], gnn_degree_vec_2_[lid]); + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t lid, char&, ValTy y) { + gnn_degree_vec_1_[lid] += y.first; + gnn_degree_vec_2_[lid] += y.second; + if (y.first || y.second) { + return true; + } else { + return false; + } + } + + //! No-op: readAny = overwritten anyways + static void reset(uint32_t lid, char&) { + gnn_degree_vec_1_[lid] = 0; + gnn_degree_vec_2_[lid] = 0; + } + + //! 
element wise set + static void setVal(uint32_t lid, char&, ValTy y) { + gnn_degree_vec_1_[lid] = y.first; + gnn_degree_vec_2_[lid] = y.second; + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 01840c39fc..c5817a9b07 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -285,24 +285,42 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// - GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } + GNNFloat GetGCNNormFactor(GraphNode lid) const { + if (global_degrees_[lid]) { + return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); + } else { + return 0.0; + } + } //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat GetGlobalDegreeNorm(GraphNode n) const { return degree_norm_[n]; } + GNNFloat GetGlobalDegreeNorm(GraphNode n) const { + if (global_degrees_[n]) { + return 1.0 / global_degrees_[n]; + } else { + return 0.0; + } + } - //! Get degree of subgraph for particular layer + //! Get degree norm of subgraph for particular layer (i.e. includes training) GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_) { + size_t degree; if (!subgraph_is_inductive_) { // case because degrees in each layer differ - return 1.0 / sampled_out_degrees_[graph_user_layer_num] - [subgraph_->SIDToLID(n)]; + degree = + sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; + } else { + degree = sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + } + if (degree) { + return 1.0 / degree; } else { - return 1.0 / sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + return 0; } } else { - return degree_norm_[n]; + return GetGlobalDegreeNorm(n); } } @@ -427,9 +445,6 @@ class GNNGraph { //! Calculate norm factor considering the entire graph void CalculateFullNormFactor(); - //! Calculate norm factor considering sampled nodes and/or training nodes - //! only (inductive) - void CalculateSpecialNormFactor(bool is_sampled, bool is_inductive); #ifdef GALOIS_ENABLE_GPU void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, @@ -518,9 +533,6 @@ class GNNGraph { size_t node_feature_length_{0}; //! Partitioned graph std::unique_ptr partitioned_graph_; - //! The entire topology of the dataset: used for things like norm factor - //! calculation or sampling - WholeGraph whole_graph_; //! Sync substrate for the partitioned graph std::unique_ptr> sync_substrate_; //! True if labels are single class @@ -570,15 +582,14 @@ class GNNGraph { //! falling in range != part of that set) bool incomplete_masks_{false}; - //! Normalization constant based on structure of the graph (degrees) - std::vector norm_factors_; - //! 
Normalization constant based on degrees (unlike nomral norm factors - //! it's only division without a square root) - std::vector degree_norm_; - //! RNG for subgraph sampling galois::PerThreadRNG sample_rng_; + // TODO LargeArray instead of vector? + //! Degrees: needed since graph is distributed + std::vector global_degrees_; + std::vector global_train_degrees_; + // TODO vars for subgraphs as necessary bool use_subgraph_{false}; bool subgraph_is_inductive_{false}; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 7759c26dca..073cde32c3 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -1,7 +1,6 @@ // defined in GNNGraph.cpp; set in order to control which matrix // gets synchronized #include "galois/GNNTypes.h" -#include "galois/BufferWrapper.h" #ifdef GALOIS_ENABLE_GPU #include "galois/GNNCudaContextHostDecls.h" #endif diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 861a982a98..265066b361 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -3,6 +3,7 @@ #include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" #include "galois/GNNMath.h" +#include "galois/graphs/DegreeSyncStructures.h" #include namespace { @@ -34,11 +35,15 @@ LoadPartition(const std::string& input_directory, } // end namespace +// Sync structure variables; global to get around sync structure +// limitations at the moment namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; galois::DynamicBitSet bitset_graph_aggregate; +uint32_t* gnn_degree_vec_1_; +uint32_t* gnn_degree_vec_2_; #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; unsigned layer_number_to_sync; @@ -84,9 +89,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, partitioned_graph_->cartesianGrid()); bitset_graph_aggregate.resize(partitioned_graph_->size()); - // read in entire graph topology - ReadWholeGraph(dataset_name); - // init norm factors using the whole graph topology + // init norm factors (involves a sync call) InitNormFactor(); #ifdef GALOIS_ENABLE_GPU @@ -580,104 +583,37 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } -void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { - std::string input_file = input_directory_ + dataset_name + ".csgr"; - GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, - input_file); - galois::graphs::readGraph(whole_graph_, input_file); -} - void galois::graphs::GNNGraph::InitNormFactor() { GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); - norm_factors_.resize(partitioned_graph_->size(), 0.0); - degree_norm_.resize(partitioned_graph_->size(), 0.0); + global_degrees_.resize(partitioned_graph_->size(), 0.0); + global_train_degrees_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); } void galois::graphs::GNNGraph::CalculateFullNormFactor() { - norm_factors_.assign(partitioned_graph_->size(), 0.0); - - // get the norm factor contribution for each node based on the GLOBAL graph - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t local_id) { - // translate lid into gid to get global degree - size_t global_id = partitioned_graph_->getGID(local_id); - // +1 because simulated self edge - 
size_t global_degree = whole_graph_.edge_end(global_id) - - whole_graph_.edge_begin(global_id) + 1; - // only set if non-zero - if (global_degree != 0) { - norm_factors_[local_id] = - 1.0 / std::sqrt(static_cast(global_degree)); - degree_norm_[local_id] = 1.0 / static_cast(global_degree); - } - }, - galois::loopname("CalculateFullNormFactor")); -} - -void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, - bool is_inductive) { - if (galois::runtime::getSystemNetworkInterface().Num > 1) { - GALOIS_LOG_FATAL("cannot run special norm factor in dist setting yet"); - } - - norm_factors_.assign(partitioned_graph_->size(), 0.0); + // TODO(loc) reset all degrees if this is called multiple times? // get the norm factor contribution for each node based on the GLOBAL graph galois::do_all( galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t local_id) { - // ignore node if not valid - if (is_sampled && is_inductive) { - if (!IsValidForPhase(local_id, GNNPhase::kTrain) || - !IsInSampledGraph(local_id)) { - return; - } - } else if (is_sampled) { - if (!IsInSampledGraph(local_id)) { - return; - } - } else if (is_inductive) { - if (!IsValidForPhase(local_id, GNNPhase::kTrain)) { - return; - } - } - - size_t degree = 0; - - // TODO(loc) make this work in a distributed setting; assuming - // whole graph is present on single host at the moment - for (EdgeIterator e = edge_begin(local_id); e != edge_end(local_id); - e++) { - size_t dest = GetEdgeDest(e); - if (is_sampled && is_inductive) { - if (!IsValidForPhase(dest, GNNPhase::kTrain) || - !IsInSampledGraph(dest)) { - continue; - } - } else if (is_sampled) { - if (!IsInSampledGraph(dest)) { - continue; - } - } else if (is_inductive) { - if (!IsValidForPhase(dest, GNNPhase::kTrain)) { - continue; - } - } else { - GALOIS_LOG_WARN( - "Why is special norm factor called if not sampled/inductive?"); + [&](size_t src) { + for (auto edge_iter = partitioned_graph_->edge_begin(src); + edge_iter != partitioned_graph_->edge_end(src); edge_iter++) { + // count degrees for all + train/other + size_t dest = GetEdgeDest(edge_iter); + if (IsValidForPhase(dest, GNNPhase::kTrain) || + IsValidForPhase(dest, GNNPhase::kOther)) { + global_train_degrees_[src] += 1; } - degree += 1; - } - - // only set if non-zero - if (degree != 0) { - norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(degree)); - degree_norm_[local_id] = 1.0 / static_cast(degree); + global_degrees_[src] += 1; } }, - galois::loopname("CalculateSpecialNormFactor")); + galois::loopname("CalculateLocalDegrees")); + // degree sync + gnn_degree_vec_1_ = global_train_degrees_.data(); + gnn_degree_vec_2_ = global_degrees_.data(); + sync_substrate_->sync( + "InitialDegreeSync"); } float galois::graphs::GNNGraph::GetGlobalAccuracy( diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index b5a538d314..3bca821078 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -320,7 +320,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.GetNormFactor(src); + source_norm = graph_.GetGCNNormFactor(src); } // init to self @@ -359,7 +359,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm * graph_.GetNormFactor(dst); + GNNFloat norm_scale = source_norm * 
graph_.GetGCNNormFactor(dst); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, From bcffe89b1a504cea94ffa569966c75b9de70dae8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 29 Apr 2021 16:21:35 -0500 Subject: [PATCH 522/660] Inductive graph uses train degrees rather Before this commit, inductive subgraph creation recounted train degrees: this is unnecessary now with the train degrees array. What this means is that this should theoretically work in distributed setting. This is the next step/commit. --- libgnn/include/galois/graphs/GNNGraph.h | 12 +++++++++--- libgnn/src/graphs/GNNGraph.cpp | 26 ++++++++----------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c5817a9b07..cab41370c8 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -293,8 +293,6 @@ class GNNGraph { } } - //! Degree norm (1 / degree) of current functional graph (e.g., sampled, - //! inductive graph, etc); calculated whenever norm factor is calculated GNNFloat GetGlobalDegreeNorm(GraphNode n) const { if (global_degrees_[n]) { return 1.0 / global_degrees_[n]; @@ -303,6 +301,14 @@ class GNNGraph { } } + GNNFloat GetGlobalTrainDegreeNorm(GraphNode n) const { + if (global_train_degrees_[n]) { + return 1.0 / global_train_degrees_[n]; + } else { + return 0.0; + } + } + //! Get degree norm of subgraph for particular layer (i.e. includes training) GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_) { @@ -312,7 +318,7 @@ class GNNGraph { degree = sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; } else { - degree = sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + degree = global_train_degrees_[subgraph_->SIDToLID(n)]; } if (degree) { return 1.0 / degree; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 265066b361..47f78b2173 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -812,11 +812,6 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, array.create(partitioned_graph_->size()); } } else { - // TODO(loc) optimize possible: inductive setting means # nodes always - // only training/other nodes, so can allocate only what is required - // Allocating full size is inefficient - sampled_out_degrees_.resize(1); - sampled_out_degrees_[0].create(partitioned_graph_->size()); subgraph_is_inductive_ = true; } } @@ -840,12 +835,14 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - galois::do_all( - galois::iterate(sampled_out_degrees_), - [&](galois::LargeArray& array) { - std::fill(array.begin(), array.end(), 0); - }, - galois::chunk_size<1>()); + if (!subgraph_is_inductive_) { + galois::do_all( + galois::iterate(sampled_out_degrees_), + [&](galois::LargeArray& array) { + std::fill(array.begin(), array.end(), 0); + }, + galois::chunk_size<1>()); + } } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { @@ -875,11 +872,6 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { partitioned_graph_->getEdgeDst(edge_iter)); } sampled += 1; - // only count once for last layer (last layer is where all - // relevant nodes will be included) - if (agg_layer_num == 0) { - sampled_out_degrees_[0][*src_iter]++; - } } total += 1; } @@ -945,8 +937,6 @@ void 
galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } total += 1; } - // galois::gDebug(*src_iter, " with degree ", - // sampled_out_degrees_[sample_layer_num][*src_iter]); } }, galois::steal(), galois::loopname("NeighborhoodSample")); From 42448f0cb7ae8221887e5e9af88f443f214cf740 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Apr 2021 12:55:37 -0500 Subject: [PATCH 523/660] GNNSubgraph fix edge dest construction Subgraph edge dest construction was using the original ID from the graph and not the subgraph ID. This was causing issues when original ID != SID which happens a lot more in distributed setting. This commit fixes it by mapping the ID correctly. --- libgnn/src/graphs/GNNSubgraph.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index cfacf02f4f..9bd467e8e3 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -35,6 +35,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.IsInSampledGraph(local_node_id)) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -47,6 +48,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.IsInSampledGraph(local_node_id)) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -144,7 +146,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { subedge_to_original_edge_[out_location] = *out_edge_iter; underlying_graph_.constructEdge( - out_location++, gnn_graph.GetEdgeDest(out_edge_iter)); + out_location++, + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); } } @@ -153,7 +156,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( in_subedge_to_original_edge_[in_location] = *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); underlying_graph_.ConstructInEdge( - in_location++, gnn_graph.GetInEdgeDest(in_edge_iter)); + in_location++, + lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); } } assert(out_location == subgraph_out_degrees_[subgraph_id]); From 7464acfa47304bc1b85434ab428a9d440d6a508e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Apr 2021 12:57:50 -0500 Subject: [PATCH 524/660] GNN graph (S)ID conversion helper functions SID conversion functions and a function exposing the LID to SID map that will be used in distributed sync (substrate needs to map LID to correct SID to get the necessary info). --- libgnn/include/galois/graphs/GNNGraph.h | 36 +++++++++++++++++++++- libgnn/include/galois/graphs/GNNSubgraph.h | 5 +++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index cab41370c8..25bc8f8f4f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -280,8 +280,42 @@ class GNNGraph { } void EnableSubgraph() { use_subgraph_ = true; } - void DisableSubgraph() { use_subgraph_ = false; } + bool IsSubgraphOn() const { return use_subgraph_; } + + //! 
Converts an id to an lid for the graph if subgraphs are in use + uint32_t ConvertToLID(GraphNode sid) const { + if (use_subgraph_) { + return subgraph_->SIDToLID(sid); + } else { + return sid; + } + } + //! Converts an LID to an SID if subgraphs are in use + uint32_t ConvertToSID(GraphNode lid) const { + if (use_subgraph_) { + return subgraph_->LIDToSID(lid); + } else { + return lid; + } + } + //! Converts SID to GID if subgraphs in use (else just return GID) + uint32_t SIDToGID(GraphNode sid) const { + if (use_subgraph_) { + return GetGID(subgraph_->SIDToLID(sid)); + } else { + return GetGID(sid); + } + } + //! Returns a pointer to the LID to SID map from the subgraph if subgraphs + //! are in use + galois::LargeArray* GetLIDToSIDPointer() { + if (use_subgraph_) { + return subgraph_->GetLIDToSIDPointer(); + } else { + return nullptr; + } + } ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 4ac7c739eb..21642b189b 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -35,6 +35,7 @@ class GNNSubgraph { } uint32_t SIDToLID(uint32_t sid) const { return subgraph_id_to_lid_[sid]; } + uint32_t LIDToSID(uint32_t lid) const { return lid_to_subgraph_id_[lid]; } ////////////////////////////////////////////////////////////////////////////// // Edge iteration and destination @@ -91,6 +92,10 @@ class GNNSubgraph { ////////////////////////////////////////////////////////////////////////////// + galois::LargeArray* GetLIDToSIDPointer() { + return &lid_to_subgraph_id_; + } + private: //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. From 1217c4bdc1b156f68091eee3130d6e14ace4e034 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Apr 2021 13:00:16 -0500 Subject: [PATCH 525/660] SAGE layer: distributed subgraph compatibility Below only applies to SAGE layer. GCN layer is behind. 1) Bitset set maps from SID to LID as necessary. 2) New sync struct for subgraphs that accounts for LID/SID mapping. 3) Aggregate sync uses correct sync struct as necessary. 
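
The "LID/SID mapping" in point 2 above means that feature rows are stored in a matrix indexed by subgraph ID, while the sync substrate addresses nodes by local ID, so the new sync structure translates every access through an LID-to-SID array and skips nodes whose entry is the max-uint32 sentinel. A reduced, self-contained sketch of that indirection follows; ReduceRow and the literals are illustrative only, and the real structure is GNNSampleSumAggregate in the GraphAggregationSyncStructures.h hunk below.

// Illustrative sketch of the LID -> SID indirection used during sync.
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

constexpr uint32_t kNotInSubgraph = std::numeric_limits<uint32_t>::max();

// Add a received row into the local matrix row for this LID, if present.
bool ReduceRow(const std::vector<uint32_t>& lid_to_sid,
               std::vector<float>& matrix, size_t column_length, uint32_t lid,
               const std::vector<float>& received) {
  uint32_t sid = lid_to_sid[lid];
  if (sid == kNotInSubgraph) {
    return false; // this host did not sample the node; nothing to update
  }
  for (size_t i = 0; i < column_length; ++i) {
    matrix[sid * column_length + i] += received[i];
  }
  return true;
}

int main() {
  const size_t column_length = 4;
  // LIDs 0 and 2 are in the subgraph (SIDs 0 and 1); LID 1 is not.
  std::vector<uint32_t> lid_to_sid = {0, kNotInSubgraph, 1};
  std::vector<float> matrix(2 * column_length, 1.0f);

  std::vector<float> incoming = {0.5f, 0.5f, 0.5f, 0.5f};
  ReduceRow(lid_to_sid, matrix, column_length, 2, incoming); // updates SID 1
  ReduceRow(lid_to_sid, matrix, column_length, 1, incoming); // skipped

  std::cout << "row for SID 1 starts with " << matrix[column_length] << "\n";
  return 0;
}
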
--- .../graphs/GraphAggregationSyncStructures.h | 81 +++++++++++++++++++ libgnn/src/graphs/GNNGraph.cpp | 30 +++++-- libgnn/src/layers/SAGELayer.cpp | 9 +-- 3 files changed, 107 insertions(+), 13 deletions(-) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 073cde32c3..bcf7ed5078 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -11,6 +11,7 @@ namespace graphs { extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; +extern galois::LargeArray* gnn_lid_to_sid_pointer_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -66,6 +67,7 @@ struct GNNSumAggregate { // assert(device_personality == DevicePersonality::CPU); ValTy extracted_vec(gnn_matrix_to_sync_column_length_); for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX memcpy extracted_vec[i] = gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]; } @@ -79,6 +81,7 @@ struct GNNSumAggregate { assert(y.size() == gnn_matrix_to_sync_column_length_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX vectorized add gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] += y[i]; } @@ -121,6 +124,84 @@ struct GNNSumAggregate { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +struct GNNSampleSumAggregate { + using ValTy = galois::gstl::Vector; + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, char&) { + // It should be a CPU synchronizing substrate. + // If the GPU flag is turned off, then personality does not exist. + // assert(device_personality == DevicePersonality::CPU); + ValTy extracted_vec(gnn_matrix_to_sync_column_length_, 0.0); + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return extracted_vec; + } + + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX memcpy + extracted_vec[i] = + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i]; + } + // move constructor should kick in here to avoid return copy + return extracted_vec; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return false; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] += y[i]; + } + return true; + } + + //! No-op: readAny = overwritten anyways + static void reset(uint32_t, char&) {} + + //! 
element wise set + static void setVal(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + #ifdef GALOIS_ENABLE_GPU GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 47f78b2173..e1b0bcab67 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -42,6 +42,7 @@ namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; galois::DynamicBitSet bitset_graph_aggregate; +galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; #ifdef GALOIS_ENABLE_GPU @@ -179,16 +180,31 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, bool is_backward) const { - // set globals for the sync substrate gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - if (!is_backward) { - sync_substrate_ - ->sync( - "GraphAggregateSync"); + if (!use_subgraph_) { + // set globals for the sync substrate + if (!is_backward) { + sync_substrate_ + ->sync( + "GraphAggregateSync"); + } else { + sync_substrate_->sync( + "BackwardGraphAggregateSync"); + } } else { - sync_substrate_->sync("BackwardGraphAggregateSync"); + // setup the SID to LID map for the sync substrate to use (SID != LID) + gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); + + if (!is_backward) { + sync_substrate_->sync("GraphAggregateSync"); + } else { + sync_substrate_->sync( + "BackwardGraphAggregateSync"); + } } } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 48ab1e0b4e..134f24f3f2 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -390,7 +390,6 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*, bool is_backward) { - galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](size_t src) { @@ -409,10 +408,8 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - // XXX set LID - graphs::bitset_graph_aggregate.set(src); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); size_t dst = graph_.GetEdgeDest(e); - // galois::gPrint("(", src, " ", dst, ")\n"); if (layer_phase_ == GNNPhase::kTrain) { // XXX @@ -450,8 +447,7 @@ void 
galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); e++) { - // XXX LID not SID - graphs::bitset_graph_aggregate.set(src); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); size_t dst = graph_.GetInEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { @@ -485,6 +481,7 @@ void galois::SAGELayer::AggregateAllCPU( }, galois::chunk_size<1>(), galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + // aggregate sync graph_.AggregateSync(aggregate_output, column_length, is_backward); } From 4f03bdfe77f7f0f8e2bc99e66582616bab33dd09 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 1 May 2021 14:08:32 -0500 Subject: [PATCH 526/660] Sampled degree sync for subgraphs, bug fix Sync subgraph degrees for all layers using a single sync call by serializing degrees into a vector. Fix bug with subgraph mode being on when sampling new subgraphs which would cause nodes from previous samples to be included unintentionally. Cleanup of some code/prints. --- .../galois/graphs/DegreeSyncStructures.h | 67 +++++++++++++++++++ libgnn/include/galois/graphs/GNNGraph.h | 12 +--- libgnn/include/galois/graphs/GNNSubgraph.h | 4 +- libgnn/src/graphs/GNNGraph.cpp | 45 +++++++++++-- libgnn/src/graphs/GNNSubgraph.cpp | 37 +++++----- libgnn/src/layers/SAGELayer.cpp | 5 +- libgnn/src/layers/SoftmaxLayer.cpp | 17 ++--- 7 files changed, 135 insertions(+), 52 deletions(-) diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 0141805df0..04c696f6ab 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -6,6 +6,9 @@ namespace graphs { extern uint32_t* gnn_degree_vec_1_; extern uint32_t* gnn_degree_vec_2_; +extern galois::DynamicBitSet bitset_sampled_degrees_; +extern std::vector>* gnn_sampled_out_degrees_; + struct InitialDegreeSync { using ValTy = std::pair; @@ -54,5 +57,69 @@ struct InitialDegreeSync { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +struct SubgraphDegreeSync { + using ValTy = galois::gstl::Vector; + + //! return a vector of floats to sync + static ValTy extract(uint32_t lid, char&) { + ValTy vec_to_send(gnn_sampled_out_degrees_->size()); + size_t count = 0; + for (galois::LargeArray& layer_degrees : + *gnn_sampled_out_degrees_) { + vec_to_send[count++] = layer_degrees[lid]; + } + assert(count == vec_to_send.size()); + return vec_to_send; + } + + static bool reduce(uint32_t lid, char&, ValTy y) { + assert(y.size() == gnn_sampled_out_degrees_->size()); + for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; + } + return true; + } + + //! No-op: readAny = overwritten anyways; can probably get away with no-op + static void reset(uint32_t lid, char&) { + for (galois::LargeArray& layer_degrees : + *gnn_sampled_out_degrees_) { + layer_degrees[lid] = 0; + } + } + + //! 
element wise set + static void setVal(uint32_t lid, char&, ValTy y) { + assert(y.size() == gnn_sampled_out_degrees_->size()); + for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + +struct SubgraphDegreeBitset { + static constexpr bool is_vector_bitset() { return false; } + static constexpr bool is_valid() { return true; } + static galois::DynamicBitSet& get() { return bitset_sampled_degrees_; } + static void reset_range(size_t begin, size_t end) { + bitset_sampled_degrees_.reset(begin, end); + } +}; + } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 25bc8f8f4f..2812cd4210 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -269,15 +269,7 @@ class GNNGraph { void SampleEdges(size_t sample_layer_num, size_t num_to_sample); //! Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph() { - // false first so that the build process can use functions to access the - // real graph - use_subgraph_ = false; - size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); - // after this, this graph is a subgraph - use_subgraph_ = true; - return num_subgraph_nodes; - } + size_t ConstructSampledSubgraph(); void EnableSubgraph() { use_subgraph_ = true; } void DisableSubgraph() { use_subgraph_ = false; } @@ -380,8 +372,6 @@ class GNNGraph { } if (local_ground_truth_labels_[to_use] != num_label_classes_) { - // galois::gPrint(lid, " ", to_use, " ", - // (int)local_ground_truth_labels_[to_use], "\n"); return local_ground_truth_labels_[to_use]; } else { GALOIS_LOG_FATAL( diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 21642b189b..976303be84 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -128,8 +128,8 @@ class GNNSubgraph { //! is for static) galois::gstl::Vector subgraph_id_to_lid_; // intermediate degrees used for edge construction - galois::gstl::Vector subgraph_out_degrees_; - galois::gstl::Vector subgraph_in_degrees_; + galois::gstl::Vector local_subgraph_out_degrees_; + galois::gstl::Vector local_subgraph_in_degrees_; //! Maps from subgraph out-edge id to original graph edge id (used to check if //! edge exists in particular layer) galois::gstl::Vector subedge_to_original_edge_; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index e1b0bcab67..1a288365bd 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -41,10 +41,16 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +//! 
For synchronization of graph aggregations galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; + +//! For synchronization of sampled degrees +galois::DynamicBitSet bitset_sampled_degrees_; +std::vector>* gnn_sampled_out_degrees_; + #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; unsigned layer_number_to_sync; @@ -692,8 +698,9 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, - global_checked); + // GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, + // global_correct, + // global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -833,6 +840,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } void galois::graphs::GNNGraph::SetupNeighborhoodSample() { + use_subgraph_ = false; new_sampled_nodes_.resize(size()); new_sampled_nodes_.reset(); @@ -859,6 +867,8 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { }, galois::chunk_size<1>()); } + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + bitset_sampled_degrees_.reset(); } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { @@ -874,7 +884,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { // only operate on if sampled - if (partitioned_graph_->getData(*src_iter)) { + if (IsInSampledGraph(src_iter)) { // marks ALL edges of nodes that connect to train/other nodes for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), @@ -895,8 +905,8 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { }, galois::steal(), galois::loopname("ChooseAllEdges")); - galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", - total.reduce(), "\n"); + galois::gPrint("Num sampled edges in inductive graph is ", sampled.reduce(), + " out of ", total.reduce(), "\n"); std::vector new_nodes = new_sampled_nodes_.getOffsets(); // update nodes, then communicate update to all hosts so that they can @@ -917,13 +927,16 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::GAccumulator sampled; galois::GAccumulator total; + // galois::GAccumulator total_nodes; sampled.reset(); total.reset(); + // total_nodes.reset(); + galois::do_all( galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { // only operate on if sampled - if (partitioned_graph_->getData(*src_iter)) { + if (IsInSampledGraph(src_iter)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) // XXX training degree + other norm, not global @@ -946,6 +959,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, new_sampled_nodes_.set( partitioned_graph_->getEdgeDst(edge_iter)); } + bitset_sampled_degrees_.set(*src_iter); // degree increment sampled_out_degrees_[sample_layer_num][*src_iter]++; sampled += 1; @@ -953,10 +967,13 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } total += 1; } + // total_nodes += 1; } }, galois::steal(), galois::loopname("NeighborhoodSample")); + // galois::gInfo(host_prefix(), "sampled nodes for layer ", sample_layer_num, + // " is ", total_nodes.reduce()); 
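  // One illustrative way to read the edge-sampling probability used in the
  // loop above (an assumption for intuition only, not text from this patch):
  // if num_to_sample uniform, independent draws with replacement are made over
  // a node of out-degree d, a given edge is missed in one draw with
  // probability (1 - 1/d), so it is kept with probability roughly
  // 1 - (1 - 1/d)^num_to_sample. The expected sampled degree is therefore
  // close to num_to_sample when d is much larger than num_to_sample, and
  // approaches d otherwise.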
galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", sampled.reduce(), " out of ", total.reduce()); @@ -973,6 +990,22 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, sync_substrate_->sync("SampleSync"); } +//! Construct the subgraph from sampled edges and corresponding nodes +size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { + // false first so that the build process can use functions to access the + // real graph + use_subgraph_ = false; + gnn_sampled_out_degrees_ = &sampled_out_degrees_; + // first, sync the degres of the sampled edges across all hosts + sync_substrate_ + ->sync( + "SubgraphDegree"); + size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); + // after this, this graph is a subgraph + use_subgraph_ = true; + return num_subgraph_nodes; +} + //////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 9bd467e8e3..387e3fc250 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -6,6 +6,9 @@ galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { galois::StatTimer timer("BuildSubgraph", kRegionName); timer.start(); CreateLocalToSubgraphMapping(gnn_graph); + if (num_subgraph_nodes_ == 0) { + return 0; + } DegreeCounting(gnn_graph); EdgeCreation(gnn_graph); NodeFeatureCreation(gnn_graph); @@ -52,7 +55,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( lid_to_subgraph_id_[local_node_id] = current_sid++; } } - galois::gDebug("Numbered sampled nodes for subgraph construction is ", + galois::gDebug("Number of sampled nodes for subgraph construction is ", current_sid); num_subgraph_nodes_ = current_sid; @@ -67,8 +70,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( timer.start(); subgraph_id_to_lid_.resize(num_subgraph_nodes_); - subgraph_out_degrees_.resize(num_subgraph_nodes_); - subgraph_in_degrees_.resize(num_subgraph_nodes_); + local_subgraph_out_degrees_.resize(num_subgraph_nodes_); + local_subgraph_in_degrees_.resize(num_subgraph_nodes_); galois::do_all( galois::iterate(gnn_graph.begin(), gnn_graph.end()), @@ -83,7 +86,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( out_degrees++; } } - subgraph_out_degrees_[subgraph_id] = out_degrees; + local_subgraph_out_degrees_[subgraph_id] = out_degrees; uint32_t in_degrees = 0; for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { @@ -91,7 +94,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( in_degrees++; } } - subgraph_in_degrees_[subgraph_id] = in_degrees; + local_subgraph_in_degrees_[subgraph_id] = in_degrees; // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", // out_degrees, " in ", in_degrees); } @@ -109,21 +112,21 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( // prefix sum over subgraph degrees from previous phase to get starting points for (size_t i = 1; i < num_subgraph_nodes_; i++) { - subgraph_out_degrees_[i] += subgraph_out_degrees_[i - 1]; - subgraph_in_degrees_[i] += subgraph_in_degrees_[i - 1]; + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; } // allocate then set node endpoints - num_subgraph_edges_ = subgraph_out_degrees_.back(); + num_subgraph_edges_ = local_subgraph_out_degrees_.back(); underlying_graph_.DeallocateOnly(); 
underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { underlying_graph_.fixEndEdge( - subgraph_id, subgraph_out_degrees_[subgraph_id]); + subgraph_id, local_subgraph_out_degrees_[subgraph_id]); underlying_graph_.FixEndInEdge( - subgraph_id, subgraph_in_degrees_[subgraph_id]); + subgraph_id, local_subgraph_in_degrees_[subgraph_id]); }); subedge_to_original_edge_.resize(num_subgraph_edges_); in_subedge_to_original_edge_.resize(num_subgraph_edges_); @@ -138,8 +141,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( uint32_t out_location = 0; uint32_t in_location = 0; if (subgraph_id != 0) { - out_location = subgraph_out_degrees_[subgraph_id - 1]; - in_location = subgraph_in_degrees_[subgraph_id - 1]; + out_location = local_subgraph_out_degrees_[subgraph_id - 1]; + in_location = local_subgraph_in_degrees_[subgraph_id - 1]; } for (auto out_edge_iter : gnn_graph.edges(node_id)) { @@ -160,8 +163,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); } } - assert(out_location == subgraph_out_degrees_[subgraph_id]); - assert(in_location == subgraph_in_degrees_[subgraph_id]); + assert(out_location == local_subgraph_out_degrees_[subgraph_id]); + assert(in_location == local_subgraph_in_degrees_[subgraph_id]); } }, galois::steal()); @@ -182,12 +185,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( &(subgraph_node_features_[subgraph_node_id * feat_length]), &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), feat_length * sizeof(GNNFeature)); - // for (unsigned i = 0; i < feat_length; i++) { - // galois::gPrint(feat_length * sizeof(GNNFeature) , " ", subgraph_node_id, - // " local id " , local_id, " feat at ", i, " is ", - // subgraph_node_features_[subgraph_node_id * feat_length + i], " ", - // gnn_graph.GetLocalFeatures()[local_id * feat_length + i], "\n"); - //} }); timer.stop(); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 134f24f3f2..9d6ca7c5cc 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -499,9 +499,10 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif - galois::gPrint(layer_dimensions_.input_rows, " ", + galois::gDebug("Layer ", graph_user_layer_number_, " ", + layer_dimensions_.input_rows, " ", layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns, "\n"); + layer_dimensions_.output_columns); // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 94523ce327..f7a345050d 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,13 +8,12 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; - //#ifndef NDEBUG - //#ifdef NDEBUG +#ifndef NDEBUG galois::DGAccumulator loss_accum; galois::DGAccumulator handled; loss_accum.reset(); handled.reset(); - //#endif +#endif galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -44,11 +43,10 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_[i] = 
GNNCrossEntropy(feature_length, ground_truth_vec->data(), &p_backward_output_matrix_[feature_length * i]); - //#ifndef NDEBUG - //#ifdef NDEBUG +#ifndef NDEBUG loss_accum += input_loss_[i]; handled += 1; - //#endif +#endif } else { VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); @@ -57,14 +55,11 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // TODO chunk size? // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); - //#ifndef NDEBUG - //#ifdef NDEBUG - +#ifndef NDEBUG GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); - - //#endif +#endif return p_backward_output_matrix_; } From 4b19e5c14183c1d91e2710abdea31b1c0ef9817e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 3 May 2021 13:44:58 -0500 Subject: [PATCH 527/660] GNN single host training minibatching Allows for training using minibatching on a single host via a command line argument. A minibatcher class has been added that takes in a mask and batches it via linear scan over it (O(n) ultimately per epoch). This mask is then used to mark the seed nodes for the sampler (which can do full sampling or it can take all nodes; this needs to be implemented). The rest of execution works the same as previous sampling subgraphs: main difference is the seed nodes aren't all training nodes, and everything works out the same way from there. Other notable changes - GNNMask class for masks - kBatch phase for minibatching; set the phase and entire pipeline works as you might expect - Removed old node sampling stuff because unused right now; might bring back SAINT sampling if it turns out to be useful - Some signature changes to functions --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNTypes.h | 4 +- libgnn/include/galois/GraphNeuralNetwork.h | 2 + libgnn/include/galois/MinibatchGenerator.h | 28 +++++ libgnn/include/galois/graphs/GNNGraph.h | 56 ++++++---- libgnn/src/GraphNeuralNetwork.cpp | 81 ++++++++++++-- libgnn/src/MinibatchGenerator.cpp | 33 ++++++ libgnn/src/graphs/GNNGraph.cpp | 116 ++++----------------- libgnn/src/layers/SAGELayer.cpp | 2 +- libgnn/src/layers/SoftmaxLayer.cpp | 1 + lonestar/libgnnbench/src/Input.cpp | 13 ++- 11 files changed, 208 insertions(+), 129 deletions(-) create mode 100644 libgnn/include/galois/MinibatchGenerator.h create mode 100644 libgnn/src/MinibatchGenerator.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 2393ce043b..46ea0fd67c 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -2,6 +2,7 @@ set(sources src/GNNMath.cpp src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp + src/MinibatchGenerator.cpp src/graphs/GNNGraph.cpp src/graphs/GNNSubgraph.cpp src/layers/DenseLayer.cpp diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 492bc841dc..5dbcf4771b 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -19,13 +19,15 @@ using GNNFloat = float; using GNNLabel = uint8_t; //! Type of a feature on vertices using GNNFeature = float; +//! Type of mask +using GNNMask = std::vector; //! Type of node index on gpus using GPUNodeIndex = uint32_t; //! Type of edge index on gpus using GPUEdgeIndex = uint64_t; //! Phase of GNN computation -enum class GNNPhase { kTrain, kValidate, kTest, kOther }; +enum class GNNPhase { kTrain, kValidate, kTest, kOther, kBatch }; //! Vector like wrapper over a pointer and size; exists solely to pass around //! 
raw pointers with size (because vectors are a no-go due to the code diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 953e925d9a..580738b133 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -92,6 +92,7 @@ class GraphNeuralNetworkConfig { } bool do_sampling() const { return do_sampling_; } + unsigned train_minibatch_size() const { return train_minibatch_size_; } //! Get the default layer config of layers in this GNN const GNNLayerConfig& default_layer_config() const { @@ -107,6 +108,7 @@ class GraphNeuralNetworkConfig { unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; + unsigned train_minibatch_size_{0}; private: //! Number of layers to construct in the GNN not including the output diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h new file mode 100644 index 0000000000..0bd063b90c --- /dev/null +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -0,0 +1,28 @@ +#pragma once + +#include "galois/GNNTypes.h" + +namespace galois { + +//! Generates minibatchs given a mask for the class of things to generate +//! the minibatch for +class MinibatchGenerator { +public: + MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size) + : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size} { + } + void GetNextMinibatch(std::vector* batch_mask); + //! True if no more minibatches from this generator + bool NoMoreMinibatches() { + return current_position_ == mask_to_minibatch_.size(); + } + //! Reset the only state (a position bit) + void ResetMinibatchState() { current_position_ = 0; } + +private: + const GNNMask& mask_to_minibatch_; + size_t minibatch_size_; + size_t current_position_{0}; +}; + +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2812cd4210..ded867787c 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -5,6 +5,7 @@ #include "galois/graphs/CuSPPartitioner.h" #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" +#include "galois/MinibatchGenerator.h" #ifdef GALOIS_ENABLE_GPU #include "galois/graphs/GNNGraph.cuh" @@ -261,7 +262,8 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! Set seed nodes, i.e., nodes that are being predicted on - void SetupNeighborhoodSample(); + void SetupNeighborhoodSample() { SetupNeighborhoodSample(GNNPhase::kTrain); } + void SetupNeighborhoodSample(GNNPhase seed_phase); //! Choose all edges from sampled nodes void SampleAllEdges(size_t agg_layer_num); @@ -310,7 +312,24 @@ class GNNGraph { } ////////////////////////////////////////////////////////////////////////////// + void SetupTrainBatcher(size_t train_batch_size) { + if (train_batcher_) { + // clear before remake + train_batcher_.reset(); + } + train_batcher_ = std::make_unique(local_training_mask_, + train_batch_size); + local_minibatch_mask_.resize(partitioned_graph_->size()); + } + + void ResetTrainMinibatcher() { train_batcher_->ResetMinibatchState(); } + //! Setup the state for the next minibatch sampling call by using the + //! minibatcher to pick up the next set batch of nodes + void PrepareNextTrainMinibatch(); + //! 
Returns true if there are still more minibatches in this graph + bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetGCNNormFactor(GraphNode lid) const { if (global_degrees_[lid]) { return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); @@ -416,7 +435,11 @@ class GNNGraph { if (use_subgraph_) { to_use = subgraph_->SIDToLID(lid); } - if (!incomplete_masks_ && current_phase != GNNPhase::kOther) { + // re: phase checks in this if: ranges are not used for these + // phases even if they might exist; it's something to look into + // possibly, though at the same time it may not be worth it + if (!incomplete_masks_ && current_phase != GNNPhase::kOther && + current_phase != GNNPhase::kBatch) { return IsValidForPhaseCompleteRange(to_use, current_phase); } else { return IsValidForPhaseMasked(to_use, current_phase); @@ -444,17 +467,6 @@ class GNNGraph { // Sampling related ////////////////////////////////////////////////////////////////////////////// - //! Loops through all master nodes and determines if it is "on" or "off" - //! (the meaning of on and off depends on how it is used; for now, it is used - //! to indicate subgraph presence); droprate controls chance of being dropped - //! (e.g. if 0.8, a node is 80% likely to not be included in subgraph) - void UniformNodeSample() { UniformNodeSample(0.5); } - void UniformNodeSample(float droprate); - - //! Use the sampling method present in GraphSAINT - void GraphSAINTSample() { GraphSAINTSample(3000, 2); }; - void GraphSAINTSample(size_t num_roots, size_t walk_depth); - //! Makes a node "sampled"; used for debugging/testing void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } //! Makes a node "not sampled"; used for debugging/testing @@ -514,7 +526,7 @@ class GNNGraph { //! given a name, mask type, and arrays to save into size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, char* masks); + GNNRange* mask_range, std::vector* masks); //! Finds nodes that aren't part of the 3 main GNN phase classifications size_t FindOtherMask(); //! Read masks of local nodes only for training, validation, and testing @@ -589,14 +601,17 @@ class GNNGraph { // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes - std::vector local_training_mask_; + GNNMask local_training_mask_; //! Bitset indicating which nodes are validation nodes - std::vector local_validation_mask_; + GNNMask local_validation_mask_; //! Bitset indicating which nodes are testing nodes - std::vector local_testing_mask_; - size_t valid_other_{0}; + GNNMask local_testing_mask_; //! Bitset indicating which nodes don't fall anywhere - std::vector other_mask_; + GNNMask other_mask_; + //! Bitset indicating which nodes are part of the minibatch + GNNMask local_minibatch_mask_; + + size_t valid_other_{0}; //! Global mask range for training nodes; must convert to LIDs when using //! 
in this class @@ -624,6 +639,9 @@ class GNNGraph { bool use_subgraph_{false}; bool subgraph_is_inductive_{false}; + std::unique_ptr train_batcher_; + std::unique_ptr test_batcher_; + ////////////////////////////////////////////////////////////////////////////// // GPU things ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 46b8a6bcdd..dc2ebb2834 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -106,12 +106,20 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; } } - if (config_.do_sampling() || config_.inductive_training_) { + + // XXX test minibatch + if (config_.do_sampling() || config_.inductive_training_ || + config.train_minibatch_size()) { // output layer not included; it will never involve sampling graph_->InitializeSamplingData(num_graph_user_layers_, config_.inductive_training_); } + if (config_.train_minibatch_size()) { + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + } + // XXX test minibatch size + // create the output layer GNNLayerDimensions output_dims = { .input_rows = max_rows, @@ -156,7 +164,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; size_t inductive_nodes = 0; - if (config_.inductive_training_) { + if (config_.inductive_training_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph graph_->SetupNeighborhoodSample(); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); @@ -179,10 +187,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); - // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); - if (config_.inductive_training_) { + if (config_.inductive_training_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { @@ -190,7 +197,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } - if (config_.do_sampling()) { + if (config_.do_sampling() && !config_.train_minibatch_size()) { graph_->SetupNeighborhoodSample(); size_t num_sampled_layers = 0; @@ -201,7 +208,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 5); + if (num_sampled_layers == 0) { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); + } else { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); + } num_sampled_layers++; } } @@ -215,11 +226,59 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } - const PointerWithSize predictions = DoInference(); - // have to get accuracy here because gradient prop destroys the predictions - // matrix - train_accuracy = GetGlobalAccuracy(predictions); - GradientPropagation(); + if (!config_.train_minibatch_size()) { + // no minibatching, full batch + const PointerWithSize predictions = DoInference(); + // have to get accuracy here because gradient prop destroys the + // predictions matrix + train_accuracy = GetGlobalAccuracy(predictions); + 
GradientPropagation(); + } else { + graph_->ResetTrainMinibatcher(); + SetLayerPhases(galois::GNNPhase::kBatch); + + size_t batch_num = 0; + + // XXX + // create mini batch graphs and loop until minibatches on all hosts done + while (true) { + galois::gInfo("Epoch ", epoch, " batch ", batch_num++); + // break when all hosts are done with minibatches + graph_->PrepareNextTrainMinibatch(); + size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + if (num_sampled_layers == 0) { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); + } else { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); + } + num_sampled_layers++; + } + } + // resize layer matrices + size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + galois::gPrint(num_subgraph_nodes, "\n"); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(num_subgraph_nodes); + } + + const PointerWithSize batch_pred = DoInference(); + DoInference(); + train_accuracy = GetGlobalAccuracy(batch_pred); + GradientPropagation(); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num, + ": Train accuracy/F1 micro is ", train_accuracy, "\n"); + // XXX sync across all hosts minibatcher state + if (!graph_->MoreTrainMinibatches()) { + break; + } + } + } epoch_timer.stop(); if (this_host == 0) { diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp new file mode 100644 index 0000000000..7c3b6dd831 --- /dev/null +++ b/libgnn/src/MinibatchGenerator.cpp @@ -0,0 +1,33 @@ +#include "galois/MinibatchGenerator.h" +#include + +void galois::MinibatchGenerator::GetNextMinibatch( + std::vector* batch_mask) { + std::fill(batch_mask->begin(), batch_mask->end(), 0); + assert(current_position_ <= mask_to_minibatch_.size()); + assert(batch_mask->size() == mask_to_minibatch_.size()); + if (current_position_ >= mask_to_minibatch_.size()) { + return; + } + + size_t current_count = 0; + // start from last positiion + while (current_position_ < mask_to_minibatch_.size()) { + if (mask_to_minibatch_[current_position_]) { + // XXX and a master node; seed nodes only exist locally + (*batch_mask)[current_position_] = 1; + current_count++; + } + // break when minibatch is large enough + current_position_++; + if (current_count == minibatch_size_) + break; + } + + // advance current position to next set bit for next call (or to end to detect + // no more minibatches + while (!mask_to_minibatch_[current_position_] && + (current_position_ < mask_to_minibatch_.size())) { + current_position_++; + } +} diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 1a288365bd..d46f75305f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -99,6 +99,10 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // init norm factors (involves a sync call) InitNormFactor(); + // XXX remove this + test_batcher_ = + std::make_unique(local_testing_mask_, 2000); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { // allocate/copy data structures over to GPU @@ -158,7 +162,7 @@ bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( bool galois::graphs::GNNGraph::IsValidForPhaseMasked( const unsigned lid, const galois::GNNPhase current_phase) const { // 
select mask to use based on phase - const std::vector* mask_to_use; + const GNNMask* mask_to_use; switch (current_phase) { case GNNPhase::kTrain: mask_to_use = &local_training_mask_; @@ -175,6 +179,9 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( } mask_to_use = &other_mask_; break; + case GNNPhase::kBatch: + mask_to_use = &local_minibatch_mask_; + break; default: GALOIS_LOG_FATAL("Invalid phase used"); mask_to_use = nullptr; @@ -246,84 +253,6 @@ void galois::graphs::GNNGraph::AggregateSync( } #endif -void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { - galois::do_all( - galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { - partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(droprate); - }); - // TODO(loc) GPU - // TODO(loc) sync the flags across all machines to have same sample on all of - // them -} - -// TODO(loc) does not work in a distributed setting: assumes the partitioned -// graph is the entire graph -void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, - size_t walk_depth) { - // reset sample - galois::do_all(galois::iterate(begin(), end()), - [&](size_t n) { partitioned_graph_->getData(n) = 0; }); - - galois::on_each([&](size_t thread_id, size_t num_threads) { - size_t my_start = 0; - size_t my_end = 0; - std::tie(my_start, my_end) = - galois::block_range(size_t{0}, num_roots, thread_id, num_threads); - size_t thread_roots = my_end - my_start; - size_t train_range = global_training_mask_range_.size; - // init RNG - drand48_data seed_struct; - srand48_r(sample_rng_.GetRandomNumber() * thread_id * num_threads, - &seed_struct); - - for (size_t root_num = 0; root_num < thread_roots; root_num++) { - // pick a random training node root at random (with replacement); - size_t root = 0; - while (true) { - long int rand_num; - lrand48_r(&seed_struct, &rand_num); - root = global_training_mask_range_.begin + (rand_num % train_range); - if (IsValidForPhase(root, GNNPhase::kTrain)) { - break; - } - } - // mark this root as sampled - SetSampledNode(root); - assert(IsInSampledGraph(root)); - - // sample more nodes based on depth of the walk - for (size_t current_depth = 0; current_depth < walk_depth; - current_depth++) { - // pick random edge, mark sampled, swap roots - EdgeIterator first_edge = edge_begin(root); - size_t num_edges = std::distance(first_edge, edge_end(root)); - if (num_edges == 0) { - break; - } - - // must select training neighbor: if it doesn't, then ignore and - // continue - // To prevent infinite loop in case node has NO training neighbor, - // this implementation will not loop until one is found and will - // not find full depth if it doesn't find any training nodes randomly - long int rand_num; - lrand48_r(&seed_struct, &rand_num); - EdgeIterator selected_edge = first_edge + (rand_num % num_edges); - size_t candidate_dest = GetEdgeDest(selected_edge); - - // TODO(loc) another possibility is to just pick it anyways regardless - // but don't mark it as sampled, though this would lead to disconnected - // graph - if (IsValidForPhase(candidate_dest, GNNPhase::kTrain)) { - SetSampledNode(candidate_dest); - assert(IsInSampledGraph(candidate_dest)); - root = candidate_dest; - } - } - } - }); -} - void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -470,7 +399,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( //! 
given a name, mask type, and arrays to save into size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, char* masks) { + GNNRange* mask_range, std::vector* masks) { size_t range_begin; size_t range_end; @@ -504,7 +433,7 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( if (mask == 1) { valid_count++; if (partitioned_graph_->isLocal(cur_line_num)) { - masks[partitioned_graph_->getLID(cur_line_num)] = 1; + (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; local_sample_count++; } } @@ -587,13 +516,13 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { // XXX i can get local sample counts from here if i need it size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, - local_training_mask_.data()); + &local_training_mask_); size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", &global_validation_mask_range_, - local_validation_mask_.data()); + &local_validation_mask_); size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, - local_testing_mask_.data()); + &local_testing_mask_); valid_other_ = FindOtherMask(); // the "other" set of nodes that don't fall into any classification if (galois::runtime::getSystemNetworkInterface().ID == 0) { @@ -671,13 +600,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( galois::iterate(begin_owned(), end_owned()), // this is possibly the subgraph id [&](const unsigned node_id) { - unsigned lid = node_id; - if (use_subgraph_) { - // convert SID over to LID - lid = subgraph_->SIDToLID(node_id); - } - - if (IsValidForPhase(lid, phase)) { + if (IsValidForPhase(node_id, phase)) { total_checked_ += 1; // get prediction by getting max // note the use of node_id here: lid only used to check original @@ -687,7 +610,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( // check against ground truth and track accordingly // TODO static cast used here is dangerous if (predicted_label == - static_cast(GetSingleClassLabel(lid))) { + static_cast(GetSingleClassLabel(node_id))) { num_correct_ += 1; } } @@ -839,14 +762,14 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } } -void galois::graphs::GNNGraph::SetupNeighborhoodSample() { +void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { use_subgraph_ = false; new_sampled_nodes_.resize(size()); new_sampled_nodes_.reset(); // for now, if training node, it goes into seed node galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsValidForPhase(*x, GNNPhase::kTrain)) { + if (IsValidForPhase(*x, seed_phase)) { SetSampledNode(*x); } else { UnsetSampledNode(*x); @@ -1006,6 +929,11 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { return num_subgraph_nodes; } +void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { + train_batcher_->GetNextMinibatch(&local_minibatch_mask_); + SetupNeighborhoodSample(GNNPhase::kBatch); +} + //////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 9d6ca7c5cc..22178ee2fa 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -187,7 +187,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::StatTimer timer("BackwardPhase", kRegionName); timer.start(); - assert(layer_phase_ == 
GNNPhase::kTrain); + assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); // derivative of activation if (!config_.disable_activation) { diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index f7a345050d..beccf42289 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -20,6 +20,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( [&](const unsigned i) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) { + // XXX VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); return; diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 0bc508963d..921baaa4df 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -114,6 +114,11 @@ llvm::cl::opt "all non-train nodes are ignored"), cll::init(false)); +llvm::cl::opt + train_minibatch_size("trainMinibatchSize", + cll::desc("Size of training minibatch (default 0)"), + cll::init(0)); + llvm::cl::opt val_interval("valInterval", cll::desc("# of epochs to test validation set (default 0)"), @@ -264,9 +269,11 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.inductive_training_ = do_inductive_training; - gnn_config.validation_interval_ = val_interval; - gnn_config.test_interval_ = test_interval; + gnn_config.inductive_training_ = do_inductive_training; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; + gnn_config.train_minibatch_size_ = train_minibatch_size; + // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From 69b5268d4b33095c2121161a9fb0b11cf840352a Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 3 May 2021 21:35:24 -0500 Subject: [PATCH 528/660] GPU update (#2) * Implement multi-GPU sage layer and test * Temporarily disable unsupported options for GPU on gpu-conv * Implement GPU Relu activation and update tests * Temporarily modify PairGlorotBengioInit() for GPUs * Implement collective MPI communications for weight synchronizations on the GPU side * Implement bitset aggregation, nonmaster masking, dropout matrix reconstruction for GPU * Remove backward correctness check of the layer 0 * Fix GPU sagelayer * Fix gpu-sage normalization * Fix proxy handling of reduce() * Add gnn log parser * Fix galois gnn log parsing scripit * Add gnn-gpu test script * Fix galois gnn log parsing script * Complete rebasing on the gnn * Add gnn experimental script * Distinguish synch() of CPU and GPU * Fix gpu conv test * Update degree normalization * Fix gpu-gnn tests * Avoid unnecessary CPU memory allocation on GPU * Add TODO for gpudirect * Rebase + optimize memory allocations Authored-by: Hochan Lee --- .../include/galois/runtime/SyncStructures.h | 7 +- .../include/galois/runtime/cuda/DeviceSync.h | 74 +++++ libgnn/CMakeLists.txt | 1 + libgnn/include/galois/CUDAUtil.h | 4 +- .../include/galois/GNNCudaContextHostDecls.h | 7 +- libgnn/include/galois/GNNMath.cuh | 7 + libgnn/include/galois/graphs/GNNGraph.cuh | 27 +- libgnn/include/galois/graphs/GNNGraph.h | 7 +- .../graphs/GraphAggregationSyncStructures.h | 3 +- libgnn/include/galois/layers/DenseLayer.h | 3 +- libgnn/include/galois/layers/GNNLayer.cuh | 31 +- libgnn/include/galois/layers/GNNLayer.h | 36 ++- .../galois/layers/GraphConvolutionalLayer.cuh | 10 +- 
libgnn/include/galois/layers/SAGELayer.cuh | 82 ++++++ libgnn/include/galois/layers/SAGELayer.h | 32 ++- libgnn/include/galois/layers/SoftmaxLayer.cuh | 2 + libgnn/src/GNNCudaContext.cu | 20 +- libgnn/src/GNNMath.cu | 27 +- libgnn/src/GNNOptimizers.cu | 2 +- libgnn/src/GraphNeuralNetwork.cpp | 24 +- libgnn/src/graphs/GNNGraph.cpp | 11 +- libgnn/src/graphs/GNNGraph.cu | 107 ++++++- libgnn/src/layers/DenseLayer.cpp | 4 + libgnn/src/layers/GNNLayer.cpp | 192 +++++++------ libgnn/src/layers/GNNLayer.cu | 144 +++++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 43 ++- libgnn/src/layers/GraphConvolutionalLayer.cu | 92 ++++-- libgnn/src/layers/SAGELayer.cpp | 186 ++++++++---- libgnn/src/layers/SAGELayer.cu | 209 ++++++++++++++ libgnn/src/layers/SoftmaxLayer.cpp | 4 +- libgnn/src/layers/SoftmaxLayer.cu | 64 ++++- libgnn/test/CMakeLists.txt | 25 ++ libgnn/test/gpu-adam-test.cpp | 10 +- libgnn/test/gpu-aggregate-sync-test.cpp | 19 +- libgnn/test/gpu-back-conv-test.cpp | 167 +++++++++++ libgnn/test/gpu-convlayer-test.cpp | 168 +++++------ libgnn/test/gpu-epoch-test.cpp | 16 +- libgnn/test/gpu-sage-layer-test.cpp | 270 ++++++++++++++++++ libgnn/test/gpu-softmaxlayer-test.cpp | 39 +-- scripts/galois_gnn_log_parser.R | 221 ++++++++++++++ scripts/run-gpu.sh | 44 +++ scripts/run_gnnsys.sh | 57 ++++ 42 files changed, 2101 insertions(+), 397 deletions(-) create mode 100644 libgnn/include/galois/layers/SAGELayer.cuh create mode 100644 libgnn/src/layers/SAGELayer.cu create mode 100644 libgnn/test/gpu-back-conv-test.cpp create mode 100644 libgnn/test/gpu-sage-layer-test.cpp create mode 100644 scripts/galois_gnn_log_parser.R create mode 100644 scripts/run-gpu.sh create mode 100644 scripts/run_gnnsys.sh diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index 44264461cd..75398c4f02 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1920,17 +1920,17 @@ class FieldFlags { static bool is_valid() { return true; } \ \ static galois::DynamicBitSet& get() { \ - if (personality == GPU_CUDA) \ + if (device_personality == DevicePersonality::GPU_CUDA) \ get_bitset_##fieldname##_cuda( \ cuda_ctx, (uint64_t*)bitset_##fieldname.get_vec().data()); \ return bitset_##fieldname; \ } \ \ static void reset_range(size_t begin, size_t end) { \ - if (personality == GPU_CUDA) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ bitset_##fieldname##_reset_cuda(cuda_ctx, begin, end); \ } else { \ - assert(personality == CPU); \ + assert(device_personality == DevicePersonality::CPU); \ bitset_##fieldname.reset(begin, end); \ } \ } \ @@ -2079,5 +2079,4 @@ class FieldFlags { } \ }; #endif - #endif // header guard diff --git a/libgluon/include/galois/runtime/cuda/DeviceSync.h b/libgluon/include/galois/runtime/cuda/DeviceSync.h index a9512b1cc1..6b49aa743f 100644 --- a/libgluon/include/galois/runtime/cuda/DeviceSync.h +++ b/libgluon/include/galois/runtime/cuda/DeviceSync.h @@ -425,6 +425,80 @@ void reset_bitset_field(struct CUDA_Context_Field* field, mask1, test2, bit_index2, mask2); } +// TODO(lhc) we may not need this later, but for now just use this +void reset_bitset_field(Shared& bitset, size_t begin, + size_t end) { + dim3 blocks; + dim3 threads; + kernel_sizing(blocks, threads); + const DynamicBitset* bitset_cpu = bitset.cpu_rd_ptr(); + assert(begin <= (bitset_cpu->size() - 1)); + assert(end <= (bitset_cpu->size() - 1)); + + size_t vec_begin = (begin + 63) / 64; + size_t vec_end; + + if 
(end == (bitset_cpu->size() - 1)) + vec_end = bitset_cpu->vec_size(); + else + vec_end = (end + 1) / 64; // floor + + size_t begin2 = vec_begin * 64; + size_t end2 = vec_end * 64; + + bool test1; + size_t bit_index1; + uint64_t mask1; + + bool test2; + size_t bit_index2; + uint64_t mask2; + + if (begin2 > end2) { + test2 = false; + + if (begin < begin2) { + test1 = true; + bit_index1 = begin / 64; + size_t diff = begin2 - begin; + assert(diff < 64); + mask1 = ((uint64_t)1 << (64 - diff)) - 1; + + // create or mask + size_t diff2 = end - end2 + 1; + assert(diff2 < 64); + mask2 = ~(((uint64_t)1 << diff2) - 1); + mask1 |= ~mask2; + } else { + test1 = false; + } + } else { + if (begin < begin2) { + test1 = true; + bit_index1 = begin / 64; + size_t diff = begin2 - begin; + assert(diff < 64); + mask1 = ((uint64_t)1 << (64 - diff)) - 1; + } else { + test1 = false; + } + + if (end >= end2) { + test2 = true; + bit_index2 = end / 64; + size_t diff = end - end2 + 1; + assert(diff < 64); + mask2 = ~(((uint64_t)1 << diff) - 1); + } else { + test2 = false; + } + } + + bitset_reset_range<<>>(bitset.gpu_rd_ptr(), vec_begin, + vec_end, test1, bit_index1, mask1, + test2, bit_index2, mask2); +} + template void reset_data_field(struct CUDA_Context_Field* field, size_t begin, size_t end, DataType val) { diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 46ea0fd67c..c5d9ee6e7a 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -41,6 +41,7 @@ if (GALOIS_ENABLE_GPU) src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu src/layers/SoftmaxLayer.cu + src/layers/SAGELayer.cu src/GraphNeuralNetwork.cu src/GNNOptimizers.cu src/GNNCudaContext.cu diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index fd51eb1362..e19b0d9525 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -58,8 +58,8 @@ inline int CUDA_GET_BLOCKS(const int N) { //! Basic kernel loop for CUDA threads //! Caffe describes it as "grid stride" -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ +#define CUDA_KERNEL_LOOP(i, s, e) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x + s; i < (e); \ i += blockDim.x * gridDim.x) //! 
Wrap a CuBLAS call with this to check if it threw any errors diff --git a/libgnn/include/galois/GNNCudaContextHostDecls.h b/libgnn/include/galois/GNNCudaContextHostDecls.h index fea68d5fec..58c45c3b97 100644 --- a/libgnn/include/galois/GNNCudaContextHostDecls.h +++ b/libgnn/include/galois/GNNCudaContextHostDecls.h @@ -1,5 +1,4 @@ #pragma once - #include "galois/cuda/HostDecls.h" extern int gpudevice; @@ -7,6 +6,7 @@ extern int gpudevice; void load_graph_CUDA_GNN(struct CUDA_Context* ctx, PartitionedGraphInfo& g, unsigned num_hosts); void resize_CUDA_layer_vector(struct CUDA_Context* ctx, size_t num_layers); +void resize_CUDA_bitset(struct CUDA_Context* ctx, size_t bitset_size); void init_CUDA_layer_vector_meta_obj(struct CUDA_Context* ctx, unsigned layer_number, unsigned num_hosts, unsigned nnodes, size_t infl_in_size, @@ -71,6 +71,11 @@ void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, uint8_t* buf, size_t column_size, unsigned layer_number); +void get_bitset_graph_aggregate_cuda(struct CUDA_Context* ctx, + uint64_t* bitset_compute); + +void bitset_graph_aggregate_reset_cuda(struct CUDA_Context* ctx, size_t begin, + size_t end); void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, size_t column_size, size_t num_nodes, diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 1b262fa6a3..a50e8974ba 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -25,6 +25,12 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +void CBlasSGEMMGPU(const cublasOperation_t trans_a, + const cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output, + bool accumulate); + //! Runs softmax + cross entropy on masked nodes. Will not overwrite all of //! the output, so make sure it's been zero'd out beforehand. //! At this point in time cross entropy is ignored because it only calculates a @@ -48,5 +54,6 @@ SoftmaxCrossEntropyBackward(char* mask, size_t num_nodes, size_t feature_length, __device__ void DoSoftmax(size_t vector_length, const GNNFloat* input, GNNFloat* output); +__device__ void GPUVectorZero(size_t vector_length, GNNFloat* vec); } // namespace galois #endif diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 2012dcd7c9..6b6ff2bb74 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -23,8 +23,8 @@ public: //! Copy over masks for the 3 sets to GPU void SetMasks(const std::vector& train, const std::vector& val, const std::vector& test); - //! Copy over norm factors - void SetNormFactors(const std::vector norm_factors); + + void AllocAggregateBitset(size_t size); GNNFeature* feature_vector() const { return feature_vector_; }; int* edge_index() const { return edge_index_; } @@ -33,7 +33,19 @@ public: char* local_training_mask() const { return local_training_mask_; } char* local_validation_mask() const { return local_validation_mask_; } char* local_testing_mask() const { return local_testing_mask_; } - GNNFloat* norm_factors() const { return norm_factors_; } + + //! Get the total degree of the partitioned graph + uint32_t* get_global_degrees() const { return global_degrees_; } + //! 
Get the total degree of the sampled subgraph + uint32_t* get_global_train_degrees() const { return global_train_degrees_; } + //! Allocate memory to objects related to normalization + void InitNormFactor(size_t num_nodes); + //! Copy degree of the partitioned graph from CPU + void SetGlobalDegrees(const std::vector global_degrees); + //! Copy degree of the sampled subgraph from CPU + void SetGlobalTrainDegrees(const std::vector global_train_degrees); + + void CopyToCPU(const PointerWithSize& input); private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS @@ -53,14 +65,19 @@ private: int* edge_destinations_{nullptr}; //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; + //! (Local) ground truth vector GNNLabel* ground_truth_{nullptr}; + // masks for phases char* local_training_mask_{nullptr}; char* local_validation_mask_{nullptr}; char* local_testing_mask_{nullptr}; - //! Norm factors used during aggregation - GNNFloat* norm_factors_; + + uint32_t* global_degrees_{nullptr}; + size_t global_degree_size_{0}; + uint32_t* global_train_degrees_{nullptr}; + size_t global_train_degree_size_{0}; }; } // namespace graphs diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index ded867787c..2d4bb5356b 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -489,13 +489,14 @@ class GNNGraph { void CalculateFullNormFactor(); #ifdef GALOIS_ENABLE_GPU - void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - const unsigned layer_number) const; + void AggregateSyncGPU(GNNFloat* matrix_to_sync, + const size_t matrix_column_size, + const unsigned layer_number) const; void InitLayerVectorMetaObjects(size_t layer_number, unsigned num_hosts, size_t infl_in_size, size_t infl_out_size); - void ResizeLayerVector(size_t num_layers); + void ResizeGPULayerVector(size_t num_layers); const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index bcf7ed5078..0dd43c3308 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -203,6 +203,7 @@ struct GNNSampleSumAggregate { }; #ifdef GALOIS_ENABLE_GPU +extern struct CUDA_Context* cuda_ctx; GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, layer_number_to_sync); @@ -210,7 +211,7 @@ GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_output, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, layer_number_to_sync); #endif - GALOIS_SYNC_STRUCTURE_BITSET(graph_aggregate); + } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index 7b00d1987c..e7dc46e9f3 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -1,3 +1,4 @@ + #pragma once #include "galois/layers/GNNLayer.h" @@ -50,7 +51,7 @@ class DenseLayer : public GNNLayer { #ifdef GALOIS_ENABLE_GPU // TODO(hochan/loc) replace with dense gpu object - GCNGPUAllocations gpu_object_; + // GCNGPUAllocations gpu_object_; #endif }; diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 9dfd09e0da..439faad738 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -16,12 +16,16 @@ public: 
void InitDropoutMemory(size_t dropout_size); //! Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); + //! Copy provided data in vector to GPU weight gradients + void CopyToWeightGradients(const std::vector& cpu_gradients); //! Copy GPU forward output to the provided vector (assumes vector is already //! correct size) - void CopyForwardOutputToCPU(std::vector* cpu_forward_output); + void CopyForwardOutputToCPU(GNNFloat* cpu_forward_output, + size_t forward_output_size); //! Copy GPU backward output to the provided vector (assumes vector is already //! correct size) - void CopyBackwardOutputToCPU(std::vector* cpu_backward_output); + void CopyBackwardOutputToCPU(GNNFloat* cpu_backward_output, + size_t backward_output_size); //! Copy GPU weight gradients to the provided vector (assumes vector is //! already correct size) void CopyWeightGradientsToCPU(std::vector* cpu_gradients); @@ -29,6 +33,9 @@ public: //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); + //! Prints backward output matrix on gpu + void PrintBackwardOutput(size_t num); + //! Does dropout on the GPU; saves non-dropped weights to output void DoDropoutGPU(const PointerWithSize input_to_dropout, PointerWithSize output, float dropout_rate); @@ -39,11 +46,30 @@ public: //! memory is allocated as necessary) GNNFloat* Allocate(const std::vector& v); + //! Initializes vectors on GPU to 1 + void InitGPUVectorTo1(GNNFloat* vector, size_t vector_size); + + //! Apply an activation function + void ActivationGPU(size_t num_forward_output_elements); + //! Apply an activation function for derivative + void ActivationDerivativeGPU(GNNFloat* gradients, + size_t num_gradients_elements); + void + ReconstructDropoutMatrixGPU(const PointerWithSize input_to_drouput, + PointerWithSize* output_matrix, + size_t num_elements, GNNFloat scale); + + void MaskNonMastersGPU(PointerWithSize* input, size_t start_node, + size_t end_node, size_t row_index); + GNNFloat* forward_output() { return forward_output_matrix_; } GNNFloat* backward_output() { return backward_output_matrix_; } GNNFloat* layer_weights() { return layer_weights_; } GNNFloat* layer_weight_gradients() { return layer_weight_gradients_; } + void CopyToCPU(PointerWithSize* input); + void CopyToCPU(GNNFloat* input, size_t size); + private: size_t* num_weights_{nullptr}; GNNFloat* forward_output_matrix_{nullptr}; @@ -52,6 +78,7 @@ private: GNNFloat* layer_weight_gradients_{nullptr}; GNNFloat* rng_results_{nullptr}; char* dropout_mask_{nullptr}; + uint8_t* activation_memo_{nullptr}; }; } // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 5cfe69b83e..9a71432471 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -8,6 +8,9 @@ #include "galois/layers/GNNLayer.cuh" #endif +//#define PRINT_VEC_LOG_ +//#define PRINT_GPU_VEC_ + namespace galois { //! Supported layer types in the GNN @@ -185,16 +188,29 @@ class GNNLayer { PointerWithSize AllocateGPU(const std::vector& v) { return PointerWithSize(base_gpu_object_.Allocate(v), v.size()); } + //! 
Copies over forward output results to CPU from GPU - const std::vector& CopyForwardOutputFromGPU() { - base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); - return forward_output_matrix_; + const std::vector CopyForwardOutputFromGPU() { + size_t cpu_forward_output_size = p_forward_output_matrix_.size(); + GNNFloat* cpu_forward_output = + (GNNFloat*)malloc(cpu_forward_output_size * sizeof(GNNFloat)); + base_gpu_object_.CopyForwardOutputToCPU(cpu_forward_output, + cpu_forward_output_size); + return std::vector(cpu_forward_output, + cpu_forward_output + cpu_forward_output_size); } + //! Copies over backward output results to CPU from GPU - const std::vector& CopyBackwardOutputFromGPU() { - base_gpu_object_.CopyBackwardOutputToCPU(&backward_output_matrix_); - return backward_output_matrix_; + const PointerWithSize CopyBackwardOutputFromGPU() { + size_t cpu_backward_output_size = p_backward_output_matrix_.size(); + GNNFloat* cpu_backward_output = + (GNNFloat*)malloc(cpu_backward_output_size * sizeof(GNNFloat)); + base_gpu_object_.CopyBackwardOutputToCPU(cpu_backward_output, + cpu_backward_output_size); + return PointerWithSize(cpu_backward_output, + cpu_backward_output_size); } + //! Copies over weight gradients to CPU from GPU const std::vector& CopyWeightGradientsFromGPU() { base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); @@ -204,6 +220,10 @@ class GNNLayer { void PrintForwardOutputGPU() { base_gpu_object_.PrintForwardOutput(forward_output_matrix_.size()); } + + void PrintBackwardOutputGPU() { + base_gpu_object_.PrintBackwardOutput(p_backward_output_matrix_.size()); + } #endif protected: @@ -293,6 +313,7 @@ class GNNLayer { //! Does some activation function based on configuration on forward output //! matrix void Activation(); + void ActivationCPU(); //! 
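  // (Illustrative aside, not part of this patch.) Both the CPU and GPU ReLU
  // paths rely on a saved mask: the forward pass records which outputs were
  // positive (activation_memo_; a plain uint8_t array on the GPU side) and
  // the backward pass zeroes the gradient wherever that record is unset.
  // Conceptually, per element i:
  //   forward:  memo[i] = (out[i] > 0);  out[i] = memo[i] ? out[i] : 0;
  //   backward: grad[i] = memo[i] ? grad[i] : 0;
  //!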
Calculate derivative of activation function based on config on the matrix void ActivationDerivative(PointerWithSize* matrix); @@ -317,6 +338,9 @@ class GNNLayer { double FloatElementsToGB(size_t num_of_floats) const { return num_of_floats * double{4} / (1 << 30); } + + void MaskNonMastersGPU(PointerWithSize* input, size_t start_node, + size_t end_node, size_t row_index); }; } // namespace galois diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index c59617828d..51a167b9c1 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -9,8 +9,11 @@ class GCNGPUAllocations { public: // free memory ~GCNGPUAllocations(); - // allocate the 3 temp arrays - void Allocate(size_t input_elements, size_t output_elements); + + void AllocateInTemp1(const size_t size); + void AllocateInTemp2(const size_t size); + void AllocateOutTemp(const size_t size); + GNNFloat* in_temp_1() { return in_temp_1_; } GNNFloat* in_temp_2() { return in_temp_2_; } GNNFloat* out_temp() { return out_temp_; } @@ -18,7 +21,8 @@ public: void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, bool use_norm); + GNNFloat* aggregate_output, bool use_norm, + bool disable_self_aggregate, size_t last_master); void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, size_t output_columns, diff --git a/libgnn/include/galois/layers/SAGELayer.cuh b/libgnn/include/galois/layers/SAGELayer.cuh new file mode 100644 index 0000000000..05f9e8556c --- /dev/null +++ b/libgnn/include/galois/layers/SAGELayer.cuh @@ -0,0 +1,82 @@ +#pragma once +#include "galois/GNNTypes.h" +#include "galois/graphs/GNNGraph.cuh" + +namespace galois { + +//! 
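// (Illustrative aside, not part of this patch; h_v, x_v, W, W_2 are notation
// only.) Compared with the GCN allocations, this object also owns the second
// ("self") weight matrix and its gradients, because per node v the SAGE layer
// roughly computes
//   h_v = aggregate(neighbor embeddings) * W + x_v * W_2
// with the self-feature GEMMs accumulating into the same output buffer. Its
// aggregation kernel also normalizes by 1/degree rather than the symmetric
// 1/sqrt(d_src + 1) * 1/sqrt(d_dst + 1) factor used by the GCN path.
//!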
Holds pointers for GPU memory for SAGE layer +class SAGEGPUAllocations { +public: + // free memory + ~SAGEGPUAllocations(); + + // allocate the 3 temp arrays + void AllocateInTemp1(const size_t size); + void AllocateInTemp2(const size_t size); + void AllocateOutTemp(const size_t size); + + GNNFloat* in_temp_1() { return in_temp_1_; } + GNNFloat* in_temp_2() { return in_temp_2_; } + GNNFloat* out_temp() { return out_temp_; } + + void AllocateWeight2(const size_t size); + void AllocateWeightGradient2(const size_t size); + + GNNFloat* layer_weights_2() { return layer_weights_2_; } + GNNFloat* layer_weight_gradients_2() { return layer_weight_gradients_2_; } + + void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, + size_t num_nodes, size_t column_length, + const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, bool use_norm, + bool is_backward); + + void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, GNNFloat* output); + void UpdateEmbeddingsDerivativeGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, + GNNFloat* output); + + void GetWeightGradientsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, const GNNFloat* prev_input, + const GNNFloat* gradients, GNNFloat* output); + + void SelfFeatureUpdateEmbeddingsGPU(size_t input_rows, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + GNNFloat* output); + + void SelfFeatureUpdateEmbeddingsDerivativeGPU(size_t input_rows, + size_t output_columns, + size_t input_columns, + const GNNFloat* gradients, + GNNFloat* output); + + void UpdateWeight2DerivativeGPU(size_t input_columns, size_t input_rows, + size_t output_columns, + const GNNFloat* prev_layer_inputs, + const GNNFloat* input_gradients, + GNNFloat* output); + + //! Copy provided data in vector to GPU self weight + void CopyToWeights2(const std::vector& cpu_layer_weights); + //! Copy provided data in vector to GPU self weight gradients + void CopyToWeight2Gradients(const std::vector& cpu_gradients); + + //! Copy GPU self weight gradients to the provided vector (assumes vector is + //! 
already correct size) + void CopyWeight2GradientsToCPU(std::vector* cpu_gradients); + +private: + GNNFloat* in_temp_1_{nullptr}; + GNNFloat* in_temp_2_{nullptr}; + GNNFloat* out_temp_{nullptr}; + GNNFloat* layer_weights_2_{nullptr}; + GNNFloat* layer_weight_gradients_2_{nullptr}; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index dd9ceb6e7b..3f12978663 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -3,7 +3,7 @@ #include "galois/layers/GradientSyncStructures.h" #ifdef GALOIS_ENABLE_GPU -// TODO(loc/hochan) +#include "galois/layers/SAGELayer.cuh" #endif namespace galois { @@ -53,9 +53,21 @@ class SAGELayer : public GNNLayer { } void InitSelfWeightsTo1() { - if (layer_weights_2_.size()) { - layer_weights_2_.assign(layer_weights_2_.size(), 1); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + size_t layer_weights_2_size = p_layer_weights_2_.size(); + if (layer_weights_2_size > 0) { + base_gpu_object_.InitGPUVectorTo1(gpu_object_.layer_weights_2(), + layer_weights_2_size); + } + } else { +#endif + if (layer_weights_2_.size()) { + layer_weights_2_.assign(layer_weights_2_.size(), 1); + } +#ifdef GALOIS_ENABLE_GPU } +#endif } //! Returns the 2nd set of weight gradients @@ -71,6 +83,17 @@ class SAGELayer : public GNNLayer { BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; +#ifdef GALOIS_ENABLE_GPU + //! Copies over self weight gradients to CPU from GPU + const std::vector& CopyWeight2GradientsFromGPU() { + if (!layer_weight_gradients_2_.size()) { + layer_weight_gradients_2_.resize(p_layer_weight_gradients_2_.size()); + } + gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); + return layer_weight_gradients_2_; + } +#endif + private: static const constexpr char* kRegionName = "SAGELayer"; //! 
CPU aggregation @@ -143,8 +166,7 @@ class SAGELayer : public GNNLayer { output_column_intermediates_; #ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) - GCNGPUAllocations gpu_object_; + SAGEGPUAllocations gpu_object_; #endif }; diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index 8e1e5d21d7..6387edaeb6 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -20,6 +20,8 @@ public: size_t feature_length, const GNNFloat* predictions, GNNFloat* output_gradient); + void CopyToCPU(GNNFloat* input, size_t size); + private: char* train_mask_; char* val_mask_; diff --git a/libgnn/src/GNNCudaContext.cu b/libgnn/src/GNNCudaContext.cu index d0512f8e72..28589da00c 100644 --- a/libgnn/src/GNNCudaContext.cu +++ b/libgnn/src/GNNCudaContext.cu @@ -6,6 +6,8 @@ #include "galois/runtime/cuda/DeviceSync.h" #include "galois/GNNCudaContextHostDecls.h" +extern Shared cuda_bitset_graph_aggregate; + // The forward declaration is in the original Context.h file; as long as // pointers to it are used it shouldn't be an issue (since space usage is // unknown at that point) @@ -120,7 +122,7 @@ void batch_set_mirror_node_layer_input_matrix_cuda( void batch_get_reset_node_layer_input_matrix_cuda( struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, DataCommMode* mode, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, buf_size, mode, column_size); } @@ -130,7 +132,7 @@ void batch_get_reset_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, uint8_t* buf, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, column_size); } @@ -189,7 +191,7 @@ void batch_set_mirror_node_layer_output_matrix_cuda( void batch_get_reset_node_layer_output_matrix_cuda( struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, DataCommMode* mode, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, buf_size, mode, column_size); } @@ -199,10 +201,20 @@ void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, uint8_t* buf, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, column_size); } +void get_bitset_graph_aggregate_cuda(struct CUDA_Context*, + uint64_t* bitset_compute) { + cuda_bitset_graph_aggregate.cpu_rd_ptr()->copy_to_cpu(bitset_compute); +} + +void bitset_graph_aggregate_reset_cuda(struct CUDA_Context*, size_t begin, + size_t end) { + reset_bitset_field(cuda_bitset_graph_aggregate, begin, end); +} + void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, size_t column_size, size_t num_nodes, unsigned layer_number) { diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 8771b75d5b..8305990fc8 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -30,20 +30,29 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output) { + CBlasSGEMMGPU(trans_a, trans_b, input_rows, input_columns, output_columns, a, + b, output, false); +} + +void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, + const 
cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, + GNNFloat* output, bool accumulate) { if (!cublas_is_init) { InitCuBLAS(); } size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? output_columns : input_columns; - float dummy0 = 0.0; - float dummy1 = 1.0; + float beta = (accumulate) ? 1.0 : 0.0; + float dummy0 = 1.0; // because cusparse assumes column major even though we're passing in row // major, the order of multiply is reversed so that it does what we // want anyways // https://stackoverflow.com/questions/56043539/cublassgemm-row-major-multiplication CUBLAS_CHECK(cublasSgemm(global_cublas_handle, trans_b, trans_a, - output_columns, input_rows, input_columns, &dummy1, - b, lead_dim_b, a, lead_dim_a, &dummy0, output, + output_columns, input_rows, input_columns, &dummy0, + b, lead_dim_b, a, lead_dim_a, &beta, output, output_columns)); CUDA_TEST("cublas sgemm failure"); } @@ -54,13 +63,15 @@ __global__ void galois::SoftmaxCrossEntropyForward( // NOTE: assumes that output is already 0'd out as it will not overwrite the // entire thing - CUDA_KERNEL_LOOP(i, num_nodes) { + CUDA_KERNEL_LOOP(i, 0, num_nodes) { if (mask[i] == 1) { galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, output + feature_length * i); // ignoring crossentropy loss calculation for now because I'm not using // loss for anything + didn't bother allocating an array to store loss // anyways + } else { + galois::GPUVectorZero(feature_length, output + feature_length * i); } } } @@ -170,3 +181,9 @@ __device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, output[i] /= denominator; } } + +__device__ void galois::GPUVectorZero(size_t vector_length, GNNFloat* vec) { + for (size_t i = 0; i < vector_length; i++) { + vec[i] = 0; + } +} diff --git a/libgnn/src/GNNOptimizers.cu b/libgnn/src/GNNOptimizers.cu index 77f3e74f5f..840554ddd4 100644 --- a/libgnn/src/GNNOptimizers.cu +++ b/libgnn/src/GNNOptimizers.cu @@ -42,7 +42,7 @@ __global__ void DoAdamUpdate(const galois::GNNFloat* derivatives, galois::GNNFloat alpha, galois::GNNFloat beta1, galois::GNNFloat beta2, galois::GNNFloat epsilon, galois::GNNFloat beta1t, galois::GNNFloat beta2t) { - CUDA_KERNEL_LOOP(i, matrix_size) { + CUDA_KERNEL_LOOP(i, 0, matrix_size) { first_moment[i] = beta1 * first_moment[i] + (1.0 - beta1) * derivatives[i]; second_moment[i] = beta2 * second_moment[i] + (1.0 - beta2) * (derivatives[i] * derivatives[i]); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index dc2ebb2834..cb139191b4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -23,7 +23,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->ResizeLayerVector(config_.num_intermediate_layers()); + graph_->ResizeGPULayerVector(config_.num_intermediate_layers()); } #endif // used for chaining layers together; begins as nullptr @@ -54,13 +54,6 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->InitLayerVectorMetaObjects( - i, galois::runtime::getSystemNetworkInterface().Num, - layer_dims.input_columns, 
layer_dims.output_columns); - } -#endif break; case GNNLayerType::kSAGE: gnn_layers_.push_back(std::move(std::make_unique( @@ -75,23 +68,25 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) l2 layer gpu -#endif break; case GNNLayerType::kDense: gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) dense layer gpu -#endif break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + // update output layer for next layer prev_output_layer = gnn_layers_.back()->GetForwardOutput(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->InitLayerVectorMetaObjects( + i, galois::runtime::getSystemNetworkInterface().Num, + layer_dims.input_columns, layer_dims.output_columns); + } +#endif } // loop backward and find last GCN/SAGE (main) layer to disable activation @@ -385,6 +380,7 @@ galois::GraphNeuralNetwork::DoInference() { // start with graph features and pass it through all layers of the network galois::PointerWithSize layer_input = graph_->GetLocalFeatures(); + for (std::unique_ptr& ptr : gnn_layers_) { layer_input = ptr->ForwardPhase(layer_input); } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index d46f75305f..cb63fbe307 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -53,6 +53,7 @@ std::vector>* gnn_sampled_out_degrees_; #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; +struct CUDA_Context* cuda_ctx; unsigned layer_number_to_sync; #endif } // namespace graphs @@ -222,7 +223,7 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, } #ifdef GALOIS_ENABLE_GPU -void galois::graphs::GNNGraph::AggregateSync( +void galois::graphs::GNNGraph::AggregateSyncGPU( GNNFloat* matrix_to_sync, const size_t matrix_column_size, const unsigned layer_number) const { size_t layer_input_mtx_column_size = @@ -539,11 +540,11 @@ void galois::graphs::GNNGraph::InitNormFactor() { global_degrees_.resize(partitioned_graph_->size(), 0.0); global_train_degrees_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); + gpu_memory_.InitNormFactor(partitioned_graph_->size()); } void galois::graphs::GNNGraph::CalculateFullNormFactor() { // TODO(loc) reset all degrees if this is called multiple times? 
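  // (Clarifying note, not part of this patch; SymNorm is a hypothetical
  // helper.) With the precomputed norm_factors_ array removed, normalization
  // is now derived on demand from the degree arrays that InitGPUMemory()
  // copies to the device: for an edge (src, dst) the GCN aggregation kernel
  // uses
  //   norm(src, dst) = 1/sqrt(degree[src] + 1) * 1/sqrt(degree[dst] + 1)
  // (and src_norm * src_norm for the self contribution), where a zero-degree
  // endpoint contributes a factor of 0. A host-side sketch of the same rule:
  //   float SymNorm(uint32_t d_src, uint32_t d_dst) {
  //     float a = d_src ? 1.0f / std::sqrt(float(d_src + 1)) : 0.0f;
  //     float b = d_dst ? 1.0f / std::sqrt(float(d_dst + 1)) : 0.0f;
  //     return a * b;
  //   }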
- // get the norm factor contribution for each node based on the GLOBAL graph galois::do_all( galois::iterate(static_cast(0), partitioned_graph_->size()), @@ -983,7 +984,9 @@ void galois::graphs::GNNGraph::InitGPUMemory() { gpu_memory_.SetLabels(local_ground_truth_labels_); gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, local_testing_mask_); - gpu_memory_.SetNormFactors(norm_factors_); + gpu_memory_.AllocAggregateBitset(partitioned_graph_->size()); + gpu_memory_.SetGlobalTrainDegrees(global_train_degrees_); + gpu_memory_.SetGlobalDegrees(global_degrees_); } void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( @@ -993,7 +996,7 @@ void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( infl_in_size, infl_out_size); } -void galois::graphs::GNNGraph::ResizeLayerVector(size_t num_layers) { +void galois::graphs::GNNGraph::ResizeGPULayerVector(size_t num_layers) { resize_CUDA_layer_vector(cuda_ctx_, num_layers); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index 96ba37db15..065e84be6c 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -1,5 +1,13 @@ +#include "gg.h" +#include "ggcuda.h" + +#include "galois/cuda/DynamicBitset.h" + #include "galois/CUDAUtil.h" #include "galois/graphs/GNNGraph.cuh" +#include "sharedptr.h" + +Shared cuda_bitset_graph_aggregate; galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { GALOIS_LOG_VERBOSE("Freeing GPU graph allocations"); @@ -13,6 +21,8 @@ galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { CUDA_FREE(local_training_mask_); CUDA_FREE(local_validation_mask_); CUDA_FREE(local_testing_mask_); + CUDA_FREE(global_degrees_); + CUDA_FREE(global_train_degrees_); } void galois::graphs::GNNGraphGPUAllocations::SetGraphTopology( @@ -83,11 +93,96 @@ void galois::graphs::GNNGraphGPUAllocations::SetMasks( test.size() * sizeof(char), cudaMemcpyHostToDevice)); } -void galois::graphs::GNNGraphGPUAllocations::SetNormFactors( - const std::vector norm_factors) { - CUDA_CHECK(cudaMalloc((void**)(&norm_factors_), - norm_factors.size() * sizeof(GNNFloat))); - CUDA_CHECK(cudaMemcpy(norm_factors_, norm_factors.data(), - norm_factors.size() * sizeof(GNNFloat), +void galois::graphs::GNNGraphGPUAllocations::InitNormFactor(size_t num_nodes) { + GALOIS_LOG_ASSERT(global_degrees_ == nullptr); + GALOIS_LOG_ASSERT(global_train_degrees_ == nullptr); + + CUDA_CHECK( + cudaMalloc((void**)(&global_degrees_), sizeof(uint32_t) * num_nodes)); + CUDA_CHECK(cudaMalloc((void**)(&global_train_degrees_), + sizeof(uint32_t) * num_nodes)); + global_degree_size_ = num_nodes; + global_train_degree_size_ = num_nodes; +} + +#if 0 // TODO(lhc) will be added +__global__ void CalculateFullNormFactorGPU() { + const unsigned thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // each warp gets a source: this var holds the first/last edge worked on by + // that warp + __shared__ int edge_begin_end[BLOCK_SIZE / WARP_SIZE][2]; + + // each warp works on a source: threads in warp split the feature + for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + if (thread_lane < 2) { + 
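      // (Clarifying comment, not in the original patch.) Lanes 0 and 1 of each
      // warp cooperatively load this source's CSR row bounds: lane 0 reads
      // edge_index[src] (first outgoing edge) and lane 1 reads
      // edge_index[src + 1] (one past the last edge) into shared memory; the
      // remaining lanes wait at the barrier below. The live aggregation
      // kernels in GraphConvolutionalLayer.cu and SAGELayer.cu use the same
      // pattern.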
edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; + } + __syncthreads(); + + const int edge_begin = edge_begin_end[warp_lane][0]; + const int edge_end = edge_begin_end[warp_lane][1]; + for (int offest = edge_begin; offset < edge_end; offset++) { + + } + } +} + +void galois::graphs::GNNGraphGPUAllocations::CalculateFullNormFactor() { + +} +#endif + +void galois::graphs::GNNGraphGPUAllocations::SetGlobalDegrees( + const std::vector global_degrees) { + if (global_degree_size_ < global_degrees.size()) { + if (global_degree_size_ > 0) { + CUDA_CHECK(cudaFree(global_degrees_)); + } + CUDA_CHECK(cudaMalloc((void**)(&global_degrees_), + global_degrees.size() * sizeof(uint32_t))); + global_degree_size_ = global_degrees.size(); + } + + CUDA_CHECK(cudaMemcpy(global_degrees_, global_degrees.data(), + global_degrees.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); +} + +void galois::graphs::GNNGraphGPUAllocations::SetGlobalTrainDegrees( + const std::vector global_train_degrees) { + if (global_train_degree_size_ < global_train_degrees.size()) { + if (global_train_degree_size_ > 0) { + CUDA_CHECK(cudaFree(global_train_degrees_)); + } + CUDA_CHECK(cudaMalloc((void**)(&global_train_degrees_), + global_train_degrees.size() * sizeof(uint32_t))); + global_train_degree_size_ = global_train_degrees.size(); + } + + CUDA_CHECK(cudaMemcpy(global_train_degrees_, global_train_degrees.data(), + global_train_degrees.size() * sizeof(uint32_t), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::AllocAggregateBitset(size_t size) { + cuda_bitset_graph_aggregate.alloc(1); + cuda_bitset_graph_aggregate.cpu_wr_ptr()->alloc(size); +} + +void galois::graphs::GNNGraphGPUAllocations::CopyToCPU( + const PointerWithSize& input) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * input.size()); + cudaMemcpy(cpu_input, input.data(), sizeof(GNNFloat) * input.size(), + cudaMemcpyDeviceToHost); + for (size_t i = 0; i < input.size(); i++) + fprintf(stdout, "** %lu is %f\n", i, cpu_input[i]); +} diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index 75e715e482..483ceb7850 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -97,10 +97,12 @@ void galois::DenseLayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this gpu_object_.UpdateEmbeddingsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, node_embeddings, base_gpu_object_.layer_weights(), output); + */ } else { #endif // CPU version is just a call into CBlas @@ -119,10 +121,12 @@ void galois::DenseLayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this gpu_object_.UpdateEmbeddingsDerivativeGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, gradients, base_gpu_object_.layer_weights(), output); + */ } else { #endif // difference is Trans for B matrix (data) to get z by y (weights is y by z diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 14d8bd8759..0c01bb788b 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -38,9 +38,10 @@ galois::GNNLayer::GNNLayer(size_t layer_num, 
GlorotBengioInit(&layer_weights_); } + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + if (!config_.disable_output) { - size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", forward output matrix ", num_output_elements, " (", FloatElementsToGB(num_output_elements), " GB)"); @@ -75,7 +76,8 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weight_gradients_.size()); p_forward_output_matrix_ = PointerWithSize( base_gpu_object_.forward_output(), forward_output_matrix_.size()); - p_backward_output_matrix_ = *backward_output_matrix; + p_backward_output_matrix_ = PointerWithSize( + base_gpu_object_.backward_output(), backward_output_matrix->size()); // TODO can clear the cpu side vectors/don't use .size() since optimally // they aren't initialized } else { @@ -127,9 +129,8 @@ void galois::GNNLayer::PairGlorotBengioInit(std::vector* vector1, for (size_t i = 0; i < vector2->size(); i++) { (*vector2)[i] = dist(rng); } + #ifdef GALOIS_ENABLE_GPU - // TODO - GALOIS_LOG_FATAL("TODO: copy both not 1"); if (device_personality == DevicePersonality::GPU_CUDA) { CopyLayerWeightsToGPU(); } @@ -200,15 +201,15 @@ void galois::GNNLayer::ReconstructDropoutMatrix( PointerWithSize* output_matrix) { galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); timer.start(); + // reuse the dropout mask from a previous dropout call + size_t num_elements = output_matrix->size(); + GNNFloat scale = 1. / (1. - config_.dropout_rate); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - // TODO(hochan) - GALOIS_LOG_FATAL("Implement me"); + base_gpu_object_.ReconstructDropoutMatrixGPU( + input_to_dropout, output_matrix, num_elements, scale); } else { #endif - // reuse the dropout mask from a previous dropout call - size_t num_elements = output_matrix->size(); - GNNFloat scale = 1. / (1. 
- config_.dropout_rate); galois::do_all( galois::iterate(static_cast(0), num_elements), [&](size_t i) { @@ -254,59 +255,69 @@ void galois::GNNLayer::Activation() { galois::StatTimer timer("ForwardActivation", "GNNLayer"); timer.start(); - if (activation_memo_.size() == 0) { - activation_memo_.resize(forward_output_matrix_.size()); - } - activation_memo_.reset(); - // TODO only does relu at the moment; should check user specified activation // and act accordingly - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.input_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - if (forward_output_matrix_[i] > 0.0) { - // do nothing, keep value; set the memo though - activation_memo_.set(i); - } else { - forward_output_matrix_[i] = 0; - } - }, - galois::loopname("ReLU")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); + } else { +#endif + if (activation_memo_.size() == 0) { + activation_memo_.resize(forward_output_matrix_.size()); + } + activation_memo_.reset(); + + galois::do_all( + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + if (forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + forward_output_matrix_[i] = 0; + } + }, + galois::loopname("ReLU")); +#ifdef GALOIS_ENABLE_GPU + } +#endif timer.stop(); } void galois::GNNLayer::ActivationDerivative( PointerWithSize* gradient) { - galois::StatTimer timer("BackwardActivation", "GNNLayer"); - timer.start(); - - // TODO only does relu at the moment; should check user specified activation - // and act accordingly - // keep gradient if the original output was greater than 0 - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.input_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - // it was <= 0 before; set back to 0 - if (!activation_memo_.test(i)) { - (*gradient)[i] = 0; - } - }, - galois::loopname("ReLU-Derivative")); - timer.stop(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationDerivativeGPU(gradient->data(), + gradient->size()); + } else { +#endif + // TODO only does relu at the moment; should check user specified activation + // and act accordingly + // keep gradient if the original output was greater than 0 + galois::do_all( + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + // it was <= 0 before; set back to 0 + if (!activation_memo_.test(i)) { + (*gradient)[i] = 0; + } + }, + galois::loopname("ReLU-Derivative")); +#ifdef GALOIS_ENABLE_GPU + } +#endif } void galois::GNNLayer::WeightGradientSyncSum() { galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); t.start(); -#ifdef GALOIS_ENABLE_GPU - // TODO(hochan) collectives here rather than gluon sync if possible like the - // CPU code - // preferably without needing to do a gpu->cpu copy -#else + int weight_size = static_cast(p_layer_weight_gradients_.size()); + // TODO(loc) remove this limitation later; can just do a loop over the weight // matrix if (p_layer_weight_gradients_.size() > @@ -314,54 +325,73 @@ void galois::GNNLayer::WeightGradientSyncSum() { GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " "int at the moment"); } - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_.data()), - 
static_cast(p_layer_weight_gradients_.size()), MPI_FLOAT, - MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + // TODO(lhc) make this clang option later + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); + MPI_Allreduce(MPI_IN_PLACE, layer_weight_gradients_.data(), weight_size, + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + base_gpu_object_.CopyToWeightGradients(layer_weight_gradients_); + } else { +#endif + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + } #endif t.stop(); } void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { -#ifdef GALOIS_ENABLE_GPU - // TODO(hochan) mask away the **non** masters on gpu - GALOIS_LOG_FATAL("implement this"); -#else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.input_columns; assert((row_index * layer_dimensions_.input_rows) <= input->size()); - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*input)[non_master * row_index + i] = 0; - } - }, - galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } #endif } void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient) { -#ifdef GALOIS_ENABLE_GPU - // TODO(hochan) mask away the **non** masters on gpu - GALOIS_LOG_FATAL("implement this"); -#else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.output_columns; - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*gradient)[non_master * row_index + i] = 0; - } - }, - galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradient)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } #endif } diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index d6616be5fe..71b2b4512f 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -9,6 +9,7 @@ galois::GNNLayerGPUAllocations::~GNNLayerGPUAllocations() { CUDA_FREE(backward_output_matrix_); CUDA_FREE(layer_weights_); CUDA_FREE(layer_weight_gradients_); + CUDA_FREE(activation_memo_); } void 
galois::GNNLayerGPUAllocations::InitInOutMemory(size_t forward_size, @@ -47,17 +48,24 @@ void galois::GNNLayerGPUAllocations::CopyToWeights( cudaMemcpyHostToDevice)); } +void galois::GNNLayerGPUAllocations::CopyToWeightGradients( + const std::vector& cpu_gradients) { + CUDA_CHECK(cudaMemcpy(layer_weight_gradients_, cpu_gradients.data(), + cpu_gradients.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( - std::vector* cpu_forward_output) { - CUDA_CHECK(cudaMemcpy(cpu_forward_output->data(), forward_output_matrix_, - cpu_forward_output->size() * sizeof(GNNFloat), + GNNFloat* cpu_forward_output, size_t forward_output_size) { + CUDA_CHECK(cudaMemcpy(cpu_forward_output, forward_output_matrix_, + forward_output_size * sizeof(GNNFloat), cudaMemcpyDeviceToHost)); } void galois::GNNLayerGPUAllocations::CopyBackwardOutputToCPU( - std::vector* cpu_backward_output) { - CUDA_CHECK(cudaMemcpy(cpu_backward_output->data(), backward_output_matrix_, - cpu_backward_output->size() * sizeof(GNNFloat), + GNNFloat* cpu_backward_output, size_t backward_output_size) { + CUDA_CHECK(cudaMemcpy(cpu_backward_output, backward_output_matrix_, + backward_output_size * sizeof(GNNFloat), cudaMemcpyDeviceToHost)); } @@ -74,7 +82,7 @@ __global__ void DoDropoutImpl(size_t input_size, const galois::GNNFloat* input_to_dropout, galois::GNNFloat* output, const galois::GNNFloat* rng_vector, char* dropout_mask, float dropout_rate, galois::GNNFloat scale) { - CUDA_KERNEL_LOOP(i, input_size) { + CUDA_KERNEL_LOOP(i, 0, input_size) { // convert the rng floats into a mask dropout_mask[i] = rng_vector[i] > dropout_rate ? 1 : 0; // use mask to keep/drop weights @@ -86,7 +94,7 @@ __global__ void DoDropoutDerivativeImpl(size_t input_size, galois::GNNFloat* input, char* dropout_mask, galois::GNNFloat scale) { - CUDA_KERNEL_LOOP(i, input_size) { + CUDA_KERNEL_LOOP(i, 0, input_size) { input[i] = input[i] * (float)dropout_mask[i] * scale; } } @@ -138,3 +146,123 @@ __global__ void PrintVector(galois::GNNFloat* v, unsigned size) { void galois::GNNLayerGPUAllocations::PrintForwardOutput(size_t size) { PrintVector<<<1, 1>>>(forward_output_matrix_, size); } + +// TODO copy from gpu function as well just in case I need to check +void galois::GNNLayerGPUAllocations::PrintBackwardOutput(size_t size) { + PrintVector<<<1, 1>>>(backward_output_matrix_, size); +} + +namespace { +__global__ void InitVectorTo1Kernel(galois::GNNFloat* vector, + size_t num_vector_elements) { + CUDA_KERNEL_LOOP(idx, 0, num_vector_elements) { vector[idx] = 1.0; } +} + +__global__ void ReluActivationKernel(galois::GNNFloat* forward_output_matrix, + size_t num_forward_output_elements, + uint8_t* activation_memo) { + CUDA_KERNEL_LOOP(idx, 0, num_forward_output_elements) { + if (forward_output_matrix[idx] > galois::GNNFloat{0}) { + activation_memo[idx] = 1; + } else { + forward_output_matrix[idx] = 0; + } + } +} + +__global__ void ReluActivationDerivativeKernel( + galois::GNNFloat* gradients, galois::GNNFloat* forward_output_matrix, + const size_t num_gradients_elements, const uint8_t* activation_memo) { + CUDA_KERNEL_LOOP(idx, 0, num_gradients_elements) { + if (!activation_memo[idx]) { + gradients[idx] = 0; + } + } +} + +__global__ void +ReconstructDropoutMatrixKernel(const galois::GNNFloat* input_to_dropout, + galois::GNNFloat* output_matrix, + char* dropout_mask, const size_t num_elements, + const galois::GNNFloat scale) { + CUDA_KERNEL_LOOP(i, 0, num_elements) { + output_matrix[i] = input_to_dropout[i] * 
scale; + } + + CUDA_KERNEL_LOOP(i, 0, num_elements) { + output_matrix[i] *= static_cast(dropout_mask[i]); + } +} + +__global__ void MaskNonMastersKernel(galois::GNNFloat* input, + uint32_t start_node, uint32_t end_node, + uint32_t row_index) { + // TODO(lhc) implement nested parallelism if it is worth + CUDA_KERNEL_LOOP(non_master, start_node, end_node) { + for (uint32_t j = 0; j < row_index; j++) { + input[non_master * row_index + j] = 0; + } + } +} +} // namespace + +void galois::GNNLayerGPUAllocations::InitGPUVectorTo1(GNNFloat* vector, + size_t vector_size) { + InitVectorTo1Kernel<<>>( + vector, vector_size); + CUDA_TEST("Failed to initialize vector to 1."); +} + +void galois::GNNLayerGPUAllocations::ActivationGPU( + size_t num_forward_output_elements) { + if (activation_memo_ == nullptr) { + CUDA_CHECK(cudaMalloc((void**)(&activation_memo_), + num_forward_output_elements * sizeof(uint8_t))); + } + ReluActivationKernel<<>>( + forward_output_matrix_, num_forward_output_elements, activation_memo_); + CUDA_TEST("Activation GPU failed."); +} + +void galois::GNNLayerGPUAllocations::ActivationDerivativeGPU( + GNNFloat* gradients, size_t num_gradients_elements) { + ReluActivationDerivativeKernel<<>>( + gradients, forward_output_matrix_, num_gradients_elements, + activation_memo_); + CUDA_TEST("ActivationDerivative GPU failed."); +} + +void galois::GNNLayerGPUAllocations::ReconstructDropoutMatrixGPU( + const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix, size_t num_elements, + GNNFloat scale) { + ReconstructDropoutMatrixKernel<<>>( + input_to_dropout.data(), output_matrix->data(), dropout_mask_, + num_elements, scale); +} + +void galois::GNNLayerGPUAllocations::MaskNonMastersGPU( + PointerWithSize* input, size_t start_node, size_t end_node, + size_t row_index) { + MaskNonMastersKernel<<>>( + input->data(), start_node, end_node, row_index); +} + +void galois::GNNLayerGPUAllocations::CopyToCPU( + PointerWithSize* input) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * input->size()); + cudaMemcpy(cpu_input, input->data(), sizeof(GNNFloat) * input->size(), + cudaMemcpyDeviceToHost); + for (size_t i = 0; i < input->size(); i++) + fprintf(stderr, "%lu = %f\n", i, cpu_input[i]); +} + +void galois::GNNLayerGPUAllocations::CopyToCPU(GNNFloat* input, size_t size) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * size); + cudaMemcpy(cpu_input, input, sizeof(GNNFloat) * size, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < size; i++) + fprintf(stderr, "%lu = %f\n", i, cpu_input[i]); +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 3bca821078..7c22627f2f 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -16,7 +16,15 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", GCN input temp var 1 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_input_elements); + } else { +#endif + in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } // only on in dropout case + if in temp is smaller than out temp @@ -26,7 +34,15 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( galois::gInfo(graph_.host_prefix(), "Creating layer ", 
layer_number_, ", GCN input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_input_elements); + } else { +#endif + in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } size_t num_output_elements = @@ -39,20 +55,27 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", GCN output temp var ", num_output_elements, " (", FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_elements); + } else { +#endif + out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.Allocate(num_input_elements, num_output_elements); // init pointers with size p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + PointerWithSize(gpu_object_.out_temp(), num_output_elements); } else { #endif p_in_temp_1_ = PointerWithSize(in_temp_1_); @@ -270,10 +293,12 @@ void galois::GraphConvolutionalLayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + size_t last_master = *(graph_.end_owned()); gpu_object_.AggregateAllGPU( graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization); - graph_.AggregateSync(aggregate_output, column_length, layer_number_); + aggregate_output, !config_.disable_normalization, + config_.disable_self_aggregate, last_master); + graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); } else { #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 882cb32391..4ef8b62eca 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -1,5 +1,13 @@ +#include "gg.h" +#include "ggcuda.h" #include "galois/GNNMath.cuh" #include "galois/layers/GraphConvolutionalLayer.cuh" +#include "galois/cuda/DynamicBitset.h" +#include "sharedptr.h" + +// TODO(lhc) better way for this declaration is to declare it +// inside of the cuda context, but this messed linking to Gluon +extern Shared cuda_bitset_graph_aggregate; galois::GCNGPUAllocations::~GCNGPUAllocations() { GALOIS_LOG_VERBOSE("Freeing GCN layer allocations"); @@ -8,24 +16,26 @@ galois::GCNGPUAllocations::~GCNGPUAllocations() { CUDA_FREE(out_temp_); } -void galois::GCNGPUAllocations::Allocate(size_t input_elements, - size_t output_elements) { - CUDA_CHECK( - cudaMalloc((void**)(&in_temp_1_), input_elements * sizeof(GNNFloat))); - CUDA_CHECK( - cudaMalloc((void**)(&in_temp_2_), input_elements * sizeof(GNNFloat))); - CUDA_CHECK( - cudaMalloc((void**)(&out_temp_), output_elements * sizeof(GNNFloat))); +void 
galois::GCNGPUAllocations::AllocateInTemp1(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_1_), size * sizeof(GNNFloat))); +} + +void galois::GCNGPUAllocations::AllocateInTemp2(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_2_), size * sizeof(GNNFloat))); +} + +void galois::GCNGPUAllocations::AllocateOutTemp(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&out_temp_), size * sizeof(GNNFloat))); } namespace { // GPU side aggregation call: no matrix multiply, just regular dst accesses -__global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, - const int* edge_index, - const int* edge_destination, - const galois::GNNFloat* norm_factors, - const galois::GNNFloat* node_embeddings, - galois::GNNFloat* aggregate_output) { +__global__ void AggregateAllKernel( + unsigned num_nodes, size_t column_length, const int* edge_index, + const int* edge_destination, const uint32_t* global_degrees, + const galois::GNNFloat* node_embeddings, galois::GNNFloat* aggregate_output, + bool disable_self_aggregate, size_t last_master, + DynamicBitset* cuda_bitset_graph_aggregate) { const unsigned thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index const unsigned thread_lane = @@ -43,10 +53,13 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, // each warp works on a source: threads in warp split the feature for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { galois::GNNFloat src_norm = 0.0; + galois::GNNFloat dst_norm = 0.0; galois::GNNFloat norm_to_use = 1.0; - if (norm_factors != nullptr) { - src_norm = norm_factors[src]; + if (global_degrees != nullptr) { + src_norm = (global_degrees[src]) + ? (1.0 / sqrt(static_cast(global_degrees[src] + 1))) + : 0.0; } if (thread_lane < 2) { @@ -60,21 +73,44 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, const int row_end = edge_begin_end[warp_lane][1]; unsigned base_src_index = src * column_length; + if (!disable_self_aggregate) { + cuda_bitset_graph_aggregate->set(src); + if (src < last_master) { + norm_to_use = src_norm * src_norm; + for (int i = 0; i < column_length; i += WARP_SIZE) { + if (thread_lane + i < column_length) { + aggregate_output[base_src_index + thread_lane + i] = + node_embeddings[base_src_index + thread_lane + i] * norm_to_use; + } + } + } + } + for (int offset = row_begin; offset < row_end; offset++) { int dst = edge_destination[offset]; unsigned base_dst_index = dst * column_length; + cuda_bitset_graph_aggregate->set(src); - if (norm_factors != nullptr) { + if (global_degrees != nullptr) { + dst_norm = + (global_degrees[dst]) + ? 
(1.0 / sqrt(static_cast(global_degrees[dst] + 1))) + : 0.0; // note that otherwise it's 1.0, so a no-op when it comes to multiply - norm_to_use = src_norm * norm_factors[dst]; + norm_to_use = src_norm * dst_norm; } // NOTE: this is where warp diverges // the feature aggregation is split among thread in a warp for (int i = 0; i < column_length; i += WARP_SIZE) { if ((thread_lane + i) < column_length) { - aggregate_output[base_src_index + thread_lane + i] += - node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; + if (global_degrees != nullptr) { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; + } else { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i]; + } } } } @@ -86,19 +122,27 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, void galois::GCNGPUAllocations::AggregateAllGPU( const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, bool use_norm) { + GNNFloat* aggregate_output, bool use_norm, bool disable_self_aggregate, + size_t last_master) { + // num_nodes should be greater than 0 to avoid negative number of thread + if (num_nodes == 0) { + return; + } + CUDA_CHECK(cudaMemset(aggregate_output, 0, num_nodes * column_length * sizeof(GNNFloat))); if (use_norm) { AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( num_nodes, column_length, gpu_graph.edge_index(), - gpu_graph.edge_destinations(), gpu_graph.norm_factors(), - node_embeddings, aggregate_output); + gpu_graph.edge_destinations(), gpu_graph.get_global_degrees(), + node_embeddings, aggregate_output, disable_self_aggregate, last_master, + cuda_bitset_graph_aggregate.gpu_wr_ptr()); } else { AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( num_nodes, column_length, gpu_graph.edge_index(), gpu_graph.edge_destinations(), nullptr, node_embeddings, - aggregate_output); + aggregate_output, disable_self_aggregate, last_master, + cuda_bitset_graph_aggregate.gpu_wr_ptr()); } CUDA_TEST("GPU aggregate all failure"); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 22178ee2fa..9696a9b460 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -20,21 +20,44 @@ galois::SAGELayer::SAGELayer(size_t layer_num, galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE second layer weights ", num_weight_elements, " (", FloatElementsToGB(num_weight_elements), " GB)"); + // TODO(lhc) for now, allocate dummy cpu weight2 for copying to GPU layer_weights_2_.resize(num_weight_elements); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeight2(num_weight_elements); + } +#endif galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE second layer gradients ", num_weight_elements, " (", FloatElementsToGB(num_weight_elements), " GB)"); layer_weight_gradients_2_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeightGradient2(num_weight_elements); + } +#endif // reinit both weight matrices as one unit PairGlorotBengioInit(&layer_weights_, &layer_weights_2_); - - // update the pointers to them as well as realloc will require it - p_layer_weights_2_ = PointerWithSize(layer_weights_2_); - 
p_layer_weight_gradients_2_ = - PointerWithSize(layer_weight_gradients_2_); - // initialize the optimizer +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // copy weight2 to GPU + gpu_object_.CopyToWeights2(layer_weights_2_); + p_layer_weights_2_ = PointerWithSize( + gpu_object_.layer_weights_2(), num_weight_elements); + p_layer_weight_gradients_2_ = PointerWithSize( + gpu_object_.layer_weight_gradients_2(), num_weight_elements); + } else { +#endif + // update the pointers to them as well as realloc will require it + p_layer_weights_2_ = PointerWithSize(layer_weights_2_); + p_layer_weight_gradients_2_ = + PointerWithSize(layer_weight_gradients_2_); +#ifdef GALOIS_ENABLE_GPU + } +#endif std::vector weight_size = {num_weight_elements}; + // initialize the optimizer second_weight_optimizer_ = std::make_unique(weight_size, 1); } @@ -47,7 +70,15 @@ galois::SAGELayer::SAGELayer(size_t layer_num, galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE input temp var 1 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_input_elements); + } else { +#endif + in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } // only on in dropout case + if in temp is smaller than out temp @@ -57,40 +88,52 @@ galois::SAGELayer::SAGELayer(size_t layer_num, galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_input_elements); + } else { +#endif + in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - // only needed if out temp would be smaller than intemp if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE output temp var ", num_output_elements, " (", FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_elements); + } else { +#endif + out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) GPU SAGE if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.Allocate(num_input_elements, num_output_elements); // init pointers with size p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + PointerWithSize(gpu_object_.out_temp(), num_output_elements); } else { #endif p_in_temp_1_ = PointerWithSize(in_temp_1_); p_in_temp_2_ = PointerWithSize(in_temp_2_); p_out_temp_ = PointerWithSize(out_temp_); #ifdef GALOIS_ENABLE_GPU - 
// TODO concat parameters } #endif @@ -100,22 +143,30 @@ galois::SAGELayer::SAGELayer(size_t layer_num, void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); t.start(); + int weight_size = static_cast(p_layer_weight_gradients_2_.size()); +#ifdef GALOIS_ENABLE_GPU + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); + MPI_Allreduce(MPI_IN_PLACE, + static_cast(layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + gpu_object_.CopyToWeight2Gradients(layer_weight_gradients_2_); + } else { +#endif + // TODO(loc) remove this limitation later; can just do a loop over the + // weight matrix + if (p_layer_weight_gradients_2_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); + } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); #ifdef GALOIS_ENABLE_GPU - // TODO(hochan) collectives here rather than gluon sync if possible like the - // CPU code - GALOIS_LOG_FATAL("implement me"); -#else - // TODO(loc) remove this limitation later; can just do a loop over the weight - // matrix - if (p_layer_weight_gradients_2_.size() > - size_t{std::numeric_limits::max()}) { - GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " - "int at the moment"); } - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_2_.data()), - static_cast(p_layer_weight_gradients_2_.size()), MPI_FLOAT, - MPI_SUM, MPI_COMM_WORLD); #endif t.stop(); } @@ -226,19 +277,29 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // this is fine because gradient won't be used to get feature gradients MaskGradientNonMasters(input_gradient); } - // input data (prev layer input or temp1) or gradient need mask - // can mask gradient if layer == 0 - // otherwise must mask other - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - input_data.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateWeight2DerivativeGPU( + layer_dimensions_.input_columns, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), p_layer_weight_gradients_2_.data()); + } else { +#endif + // input data (prev layer input or temp1) or gradient need mask + // can mask gradient if layer == 0 + // otherwise must mask other + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_data.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); +#ifdef GALOIS_ENABLE_GPU + } +#endif } WeightGradientSyncSum2(); - // AFW = O - // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || @@ -369,12 +430,12 @@ void galois::SAGELayer::AggregateAll( if (!IsSampledLayer()) { gpu_object_.AggregateAllGPU( graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization); + aggregate_output, !config_.disable_normalization, is_backward); } else { // 
TODO(hochan) GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); } - graph_.AggregateSync(aggregate_output, column_length, layer_number_); + graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); } else { #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, @@ -519,17 +580,21 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( galois::StatTimer timer("SelfForwardXForm", kRegionName); timer.start(); #ifdef GALOIS_ENABLE_GPU - // TODO self change + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, output); + } else { #endif - // note use of layer weights 2 differentiates this from above - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_2_.data(), output, true); + // note use of layer weights 2 differentiates this from above + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU -} + } #endif -timer.stop(); + timer.stop(); } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, @@ -567,16 +632,21 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU - // TODO gpu self + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, output); + } else { #endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - // true at end -> accumulate - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_2_.data(), output, true); + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + // true at end -> accumulate + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU + } #endif timer.stop(); } diff --git a/libgnn/src/layers/SAGELayer.cu b/libgnn/src/layers/SAGELayer.cu new file mode 100644 index 0000000000..33cf32d9d3 --- /dev/null +++ b/libgnn/src/layers/SAGELayer.cu @@ -0,0 +1,209 @@ +#include "gg.h" +#include "ggcuda.h" +#include "galois/cuda/DynamicBitset.h" +#include "galois/GNNMath.cuh" +#include "galois/layers/SAGELayer.cuh" + +extern Shared cuda_bitset_graph_aggregate; + +galois::SAGEGPUAllocations::~SAGEGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing SAGE layer allocations"); + CUDA_FREE(in_temp_1_); + CUDA_FREE(in_temp_2_); + CUDA_FREE(out_temp_); + CUDA_FREE(layer_weights_2_); + CUDA_FREE(layer_weight_gradients_2_); +} + +void galois::SAGEGPUAllocations::AllocateWeight2(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&layer_weights_2_), size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateWeightGradient2(const size_t size) { + 
CUDA_CHECK(cudaMalloc((void**)(&layer_weight_gradients_2_), + size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateInTemp1(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_1_), size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateInTemp2(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_2_), size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateOutTemp(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&out_temp_), size * sizeof(GNNFloat))); +} + +namespace { +// GPU side aggregation call: no matrix multiply, just regular dst accesses +__global__ void AggregateAllKernel( + unsigned num_nodes, size_t column_length, const int* edge_index, + const int* edge_destination, const uint32_t* degree_for_norm, + const galois::GNNFloat* node_embeddings, galois::GNNFloat* aggregate_output, + DynamicBitset* cuda_bitset_graph_aggregate, bool is_backward) { + const unsigned thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // each warp gets a source: this var holds the first/last edge worked on by + // that warp + __shared__ int edge_begin_end[BLOCK_SIZE / WARP_SIZE][2]; + + // each warp works on a source: threads in warp split the feature + for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + galois::GNNFloat norm_to_use = 0.0; + + if (degree_for_norm != nullptr && !is_backward) { + norm_to_use = (degree_for_norm[src]) ? (1.0 / degree_for_norm[src]) : 0.0; + } + + if (thread_lane < 2) { + edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; + } + // essentially what this is doing is making 2 of the threads set edge + // begin/end; all threads wait for sync + __syncthreads(); + + const int row_begin = edge_begin_end[warp_lane][0]; + const int row_end = edge_begin_end[warp_lane][1]; + unsigned base_src_index = src * column_length; + + for (int offset = row_begin; offset < row_end; offset++) { + cuda_bitset_graph_aggregate->set(src); + int dst = edge_destination[offset]; + unsigned base_dst_index = dst * column_length; + + if (degree_for_norm != nullptr && is_backward) { + norm_to_use = + (degree_for_norm[dst]) ? 
(1.0 / degree_for_norm[dst]) : 0.0; + } + + // NOTE: this is where warp diverges + // the feature aggregation is split among thread in a warp + for (int i = 0; i < column_length; i += WARP_SIZE) { + if ((thread_lane + i) < column_length) { + if (degree_for_norm != nullptr) { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; + } else { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i]; + } + } + } + } + } +} + +} // namespace + +// TODO(lhc) Will need to iterate over in-edges if is_backward is on +void galois::SAGEGPUAllocations::AggregateAllGPU( + const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, bool use_norm, bool is_backward) { + // num_nodes should be greater than 0 to avoid negative number of thread + if (num_nodes == 0) { + return; + } + + CUDA_CHECK(cudaMemset(aggregate_output, 0, + num_nodes * column_length * sizeof(GNNFloat))); + if (use_norm) { + uint32_t* degree_for_norm{nullptr}; + // TODO(lhc) will be added for sampling + // if (use_subgraph_) { + //} else { + degree_for_norm = gpu_graph.get_global_degrees(); + //} + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), degree_for_norm, node_embeddings, + aggregate_output, cuda_bitset_graph_aggregate.gpu_wr_ptr(), + is_backward); + } else { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), nullptr, node_embeddings, + aggregate_output, cuda_bitset_graph_aggregate.gpu_wr_ptr(), + is_backward); + } + CUDA_TEST("GPU aggregate all failure"); +} + +void galois::SAGEGPUAllocations::UpdateEmbeddingsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* node_embeddings, const GNNFloat* layer_weights, + GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, num_nodes, input_columns, + output_columns, node_embeddings, layer_weights, output); +} + +void galois::SAGEGPUAllocations::UpdateEmbeddingsDerivativeGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* gradients, const GNNFloat* layer_weights, + GNNFloat* output) { + // note output clumns/input columns are flipped due to transpose of the + // layer weights + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_T, num_nodes, output_columns, + input_columns, gradients, layer_weights, output); +} + +void galois::SAGEGPUAllocations::GetWeightGradientsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* prev_input, const GNNFloat* gradients, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_T, CUBLAS_OP_N, input_columns, num_nodes, + output_columns, prev_input, gradients, output); +} + +void galois::SAGEGPUAllocations::SelfFeatureUpdateEmbeddingsGPU( + size_t input_rows, size_t input_columns, size_t output_columns, + const GNNFloat* node_embeddings, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, input_rows, input_columns, + output_columns, node_embeddings, layer_weights_2_, output, + true); +} + +void galois::SAGEGPUAllocations::SelfFeatureUpdateEmbeddingsDerivativeGPU( + size_t input_rows, size_t output_columns, size_t input_columns, + const GNNFloat* gradients, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_T, input_rows, 
output_columns, + input_columns, gradients, layer_weights_2_, output, true); +} + +void galois::SAGEGPUAllocations::UpdateWeight2DerivativeGPU( + size_t input_columns, size_t input_rows, size_t output_columns, + const GNNFloat* prev_layer_inputs, const GNNFloat* input_gradients, + GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_T, CUBLAS_OP_N, input_columns, input_rows, + output_columns, prev_layer_inputs, input_gradients, output); +} + +void galois::SAGEGPUAllocations::CopyToWeights2( + const std::vector& cpu_layer_weights) { + CUDA_CHECK(cudaMemcpy(layer_weights_2_, cpu_layer_weights.data(), + cpu_layer_weights.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + +void galois::SAGEGPUAllocations::CopyToWeight2Gradients( + const std::vector& cpu_gradients) { + CUDA_CHECK(cudaMemcpy(layer_weight_gradients_2_, cpu_gradients.data(), + cpu_gradients.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + +void galois::SAGEGPUAllocations::CopyWeight2GradientsToCPU( + std::vector* cpu_gradients) { + CUDA_CHECK(cudaMemcpy(cpu_gradients->data(), layer_weight_gradients_2_, + cpu_gradients->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index beccf42289..312bdab9ac 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -72,8 +72,8 @@ galois::SoftmaxLayer::ForwardPhase( if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.ForwardPhaseGPU( layer_phase_, graph_.size(), layer_dimensions_.input_columns, - input_embeddings.data(), p_forward_output_matrix_.data()); - return p_forward_output_matrix_; + input_embeddings.data(), p_backward_output_matrix_.data()); + return p_backward_output_matrix_; } #endif return ForwardPhaseCPU(input_embeddings); diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index f24a6f1e77..e29c1bb201 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -3,33 +3,77 @@ #include "galois/Logging.h" #include "galois/layers/SoftmaxLayer.cuh" +void galois::SoftmaxLayerGPU::CopyToCPU(GNNFloat* input, size_t size) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * size); + cudaMemcpy(cpu_input, input, sizeof(GNNFloat) * size, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < size; i++) + fprintf(stderr, "%lu = %f\n", i, cpu_input[i]); +} + void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, const GNNFloat* input_embeddings, GNNFloat* output) { char* mask_to_use = ChooseMask(phase); - CUDA_CHECK( - cudaMemset(output, 0, num_nodes * feature_length * sizeof(GNNFloat))); SoftmaxCrossEntropyForward<<>>( mask_to_use, num_nodes, feature_length, input_embeddings, output); CUDA_TEST("Softmax cross entropy forward failed"); } +__global__ void SoftmaxBackward(char* mask, size_t num_nodes, + size_t feature_length, + const galois::GNNFloat* predictions, + const galois::GNNLabel* ground_truth, + galois::GNNFloat* output_gradient) { + const unsigned global_thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned warp_thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = global_thread_id / WARP_SIZE; // global warp index + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // a warp works on a single node at once + for (unsigned wid = warp_id; wid < num_nodes; wid += 
num_warps) { + // operate only if masked + if (mask[wid] == 1) { + unsigned base_index = wid * feature_length; + // TODO can refactor below to device functions + // cross entropy derivative + // each thread of warp takes different feature + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + if (feat_index == (unsigned)ground_truth[wid]) { + output_gradient[base_index + feat_index] = + predictions[base_index + feat_index] - 1; + } else { + output_gradient[base_index + feat_index] = + predictions[base_index + feat_index]; + } + } + } + __syncthreads(); + } + } +} + void galois::SoftmaxLayerGPU::BackwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, const GNNFloat* predictions, GNNFloat* output_gradient) { assert(feature_length <= MAX_NUM_CLASSES); + // num_nodes should be greater than 0 to avoid negative number of thread + if (num_nodes == 0) { + return; + } + char* mask_to_use = ChooseMask(phase); - CUDA_CHECK(cudaMemset(output_gradient, 0, - num_nodes * feature_length * sizeof(GNNFloat))); - // TODO check the launch parameters; this is taken directly from the original - // code - SoftmaxCrossEntropyBackward<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, - BLOCK_SIZE>>>(mask_to_use, num_nodes, - feature_length, predictions, - local_labels_, output_gradient); + SoftmaxBackward<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + mask_to_use, num_nodes, feature_length, predictions, local_labels_, + output_gradient); + CUDA_TEST("Softmax cross entropy backward failed"); } diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 91835cfc07..9834b302e7 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -86,6 +86,10 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(sample-bit-test galois_gnn) add_test(NAME sample-bit-test COMMAND sample-bit-test) else() + add_executable(gpu-sage-layer-test gpu-sage-layer-test.cpp) + target_link_libraries(gpu-sage-layer-test galois_gnn) + add_test(NAME gpu-sage-layer-test COMMAND gpu-sage-layer-test) + add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) @@ -104,6 +108,27 @@ else() add_executable(gpu-aggregate-sync-test gpu-aggregate-sync-test.cpp) target_link_libraries(gpu-aggregate-sync-test galois_gnn) + + set(gpu_hosts) + set(gpu_host 3) #TODO(lhc) more than 4 gpus, test failed + # seems like it happened due to graph size 0. + # so let me postpone this + while (${gpu_host} GREATER 1) + list(APPEND gpu_hosts ${gpu_host}) + math(EXPR gpu_host "${gpu_host} - 1") + endwhile() + list(APPEND gpu_hosts "1") + + add_executable(gpu-back-conv-test gpu-back-conv-test.cpp) + target_link_libraries(gpu-back-conv-test galois_gnn) + foreach(gpu_host_count ${gpu_hosts}) + set(PSET "-pset=") + foreach(iter RANGE 1 ${gpu_host_count}) + set(PSET "${PSET}g") + endforeach() + add_test(NAME run-gpu-back-conv-${gpu_host_count} COMMAND mpiexec --bind-to none -n ${gpu_host_count} ./gpu-back-conv-test ${PSET} -numNodes=1) + set_tests_properties(run-gpu-back-conv-${gpu_host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") + endforeach() endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index ed99982a78..58da1d3b68 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -1,4 +1,4 @@ -//! @file adam-test.cpp +//! 
@file gpu-adam-test.cpp //! Tests the adam optimizer #include "galois/DistGalois.h" #include "galois/GNNOptimizers.h" @@ -32,8 +32,12 @@ int main() { dimension_0.input_rows = 7; dimension_0.input_columns = test_graph.GetNumLabelClasses(); dimension_0.output_columns = test_graph.GetNumLabelClasses(); - auto alloc_layer = - std::make_unique(3, test_graph, dimension_0); + std::vector output_matrix; + output_matrix.resize(dimension_0.input_rows * dimension_0.input_columns); + + galois::PointerWithSize output_layer(output_matrix); + auto alloc_layer = std::make_unique( + 3, test_graph, &output_layer, dimension_0); std::vector weights1 = {1, 1}; std::vector weights2 = {10}; diff --git a/libgnn/test/gpu-aggregate-sync-test.cpp b/libgnn/test/gpu-aggregate-sync-test.cpp index a3f645c5ee..3a0ee7f3d4 100644 --- a/libgnn/test/gpu-aggregate-sync-test.cpp +++ b/libgnn/test/gpu-aggregate-sync-test.cpp @@ -29,17 +29,22 @@ int main() { l_config.disable_aggregate_after_update = true; unsigned num_layers = 2; - test_graph->ResizeLayerVector(num_layers); + test_graph->ResizeGPULayerVector(num_layers); test_graph->InitLayerVectorMetaObjects( 0, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); test_graph->InitLayerVectorMetaObjects( 1, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner layer_0->ForwardPhase(test_graph->GetLocalFeatures()); @@ -110,7 +115,7 @@ int main() { // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); - const std::vector& layer_0_backward_output = + const galois::PointerWithSize& layer_0_backward_output = layer_0->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// @@ -126,8 +131,8 @@ int main() { // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// std::unique_ptr layer_1 = - std::make_unique(1, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 1, *(test_graph.get()), &p_back, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph->GetLocalFeatures()); const std::vector& layer_1_forward_output = @@ -176,7 +181,7 @@ int main() { // since layer isn't 0 anymore, backward phase will actually return something dummy_ones_v.assign(test_graph->size() * 2, 1); layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); - const std::vector& layer_1_backward_output = + const galois::PointerWithSize& layer_1_backward_output = layer_1->CopyBackwardOutputFromGPU(); for (size_t row = 0; row < test_graph->size(); row++) { diff --git a/libgnn/test/gpu-back-conv-test.cpp b/libgnn/test/gpu-back-conv-test.cpp new file mode 100644 index 0000000000..c089ffb698 --- /dev/null +++ b/libgnn/test/gpu-back-conv-test.cpp @@ -0,0 +1,167 @@ +//! 
@file gpu-back-conv-test.cpp +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/CUDAUtilHostDecls.h" + +extern int gpudevice; + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + const unsigned my_host_id = galois::runtime::getHostID(); + gpudevice = my_host_id; + SetCUDADeviceId(gpudevice); + device_personality = DevicePersonality::GPU_CUDA; + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph.size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.DebugConfig(); + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(test_graph.size() * 3); + galois::PointerWithSize p_back(back_matrix); + + // dummy 1 matrix + std::vector dummy_ones_v(test_graph.size() * 2, 1); + + unsigned num_layers = 2; + test_graph.ResizeGPULayerVector(num_layers); + // require 0th substrate initialization + test_graph.InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + + std::vector output_matrix; + output_matrix.resize(dimension_0.input_rows * dimension_0.input_columns); + galois::PointerWithSize output_layer(output_matrix); + + // create layer 1 for testing backward prop actually giving weights back + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, &p_back, + dimension_0, dcon); + galois::PointerWithSize dummy_ones = layer_1->AllocateGPU(dummy_ones_v); + layer_1->InitAllWeightsTo1(); + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_VASSERT(layer_1_forward_output[row * 2 + c] == ground_truth, + "{} not {}", ground_truth, + layer_1_forward_output[row * 2 + c]); + } + } + + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 2; + break; + case 1: + ground_truth = 4; + break; + case 2: + ground_truth = 4; + break; + case 3: + ground_truth = 4; + break; + case 4: + ground_truth = 4; + break; + case 5: + ground_truth = 4; + break; + case 6: + ground_truth = 2; + break; + 
default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT(layer_1_backward_output[row * 3 + c] == ground_truth); + } + } + + const std::vector& layer_1_weight_gradients = + layer_1->CopyWeightGradientsFromGPU(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[0] == 36, "36 not {}", + layer_1_weight_gradients[0]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[1] == 36, "36 not {}", + layer_1_weight_gradients[1]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[2] == 36, "36 not {}", + layer_1_weight_gradients[2]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[3] == 36, "36 not {}", + layer_1_weight_gradients[3]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[4] == 36, "36 not {}", + layer_1_weight_gradients[4]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[5] == 36, "36 not {}", + layer_1_weight_gradients[5]); + + layer_1.reset(); + + return 0; +} diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 947a0b8703..553d96e1a2 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -33,18 +33,27 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); - unsigned num_layers = 2; - test_graph.ResizeLayerVector(num_layers); + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + + unsigned num_layers = 3; + test_graph.ResizeGPULayerVector(num_layers); test_graph.InitLayerVectorMetaObjects( 0, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); test_graph.InitLayerVectorMetaObjects( 1, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 2, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, test_graph, + std::make_unique(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -86,36 +95,6 @@ int main() { // point passing back anything // galois::PointerWithSize layer_0_backward_output = layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - const std::vector& layer_0_backward_output = - layer_0->CopyBackwardOutputFromGPU(); - - ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 backward output; all 0 because layer 0 - ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - 
GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); const std::vector& layer_0_weight_gradients = layer_0->CopyWeightGradientsFromGPU(); @@ -134,7 +113,7 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -160,8 +139,9 @@ int main() { // since layer isn't 0 anymore, backward phase will actually return something dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - const std::vector& layer_1_backward_output = - layer_1->CopyBackwardOutputFromGPU(); + const galois::PointerWithSize& + layer_1_backward_output = layer_1->CopyBackwardOutputFromGPU(); + ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected ////////////////////////////////////////////////////////////////////////////// @@ -204,62 +184,66 @@ int main() { // TODO get dropout and activation working - // galois::GNNLayerConfig config; - // config.do_dropout = true; - // config.do_activation = true; - // config.do_normalization = true; - // config.allow_aggregate_after_update = false; - - //// finally, just make sure dropout and activation run without crashes - //// (verification requires floating point accuracy or setting a seed which I - //// don't have time for at the moment - //// TODO in future maybe add better unit test for this - // std::unique_ptr layer_2 = - // std::make_unique(1, test_graph, - // dimension_0, config); - // galois::PointerWithSize l2_fo = - // layer_2->ForwardPhase(test_graph.GetLocalFeatures()); - // GALOIS_LOG_ASSERT(l2_fo.size() == 14); - // GALOIS_LOG_VERBOSE("{}", l2_fo[0]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[1]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[2]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[3]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[4]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[5]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[6]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[7]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[8]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[9]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[10]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[11]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[12]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - - // galois::PointerWithSize l2_bo = - // layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - - // GALOIS_LOG_ASSERT(l2_bo.size() == 21); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); - // GALOIS_LOG_VERBOSE("{}", 
(l2_bo)[9]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + galois::GNNLayerConfig config; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; + config.disable_aggregate_after_update = true; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + std::unique_ptr layer_2 = + std::make_unique(2, test_graph, &p_back, + dimension_0, config); + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + // pointer is to GPU memory: copy it over to a CPU source for verification + const std::vector& l2_fo = + layer_2->CopyForwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& l2_bo = + layer_2->CopyBackwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 3ac2c2b2ed..8b71b81e3f 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -1,4 +1,4 @@ -//! @file epoch-test.cpp +//! @file gpu-epoch-test.cpp //! Run 50 epochs of training to see if results improve. 
#include "galois/Logging.h" @@ -23,9 +23,7 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = true; - layer_config.do_activation = false; - layer_config.do_normalization = true; + layer_config.DebugConfig(); // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, @@ -49,22 +47,18 @@ int main() { main_timer.start(); for (size_t epoch = 0; epoch < 100; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); - if (cpu_pred.size() != predictions.size()) { - cpu_pred.resize(predictions.size()); - } gnn->GradientPropagation(); // copy to cpu // TODO currently adam has this helper function; it should be handled // by other class though - adam->CopyToVector(cpu_pred, predictions); galois::gPrint("Epoch ", epoch, ": Accuracy is ", - gnn->GetGlobalAccuracy(cpu_pred), "\n"); + gnn->GetGlobalAccuracy(predictions), "\n"); } // check test accuracy gnn->SetLayerPhases(galois::GNNPhase::kTest); galois::PointerWithSize predictions = gnn->DoInference(); - adam->CopyToVector(cpu_pred, predictions); - galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(cpu_pred), "\n"); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(predictions), + "\n"); main_timer.stop(); } diff --git a/libgnn/test/gpu-sage-layer-test.cpp b/libgnn/test/gpu-sage-layer-test.cpp new file mode 100644 index 0000000000..7cec3b9a2b --- /dev/null +++ b/libgnn/test/gpu-sage-layer-test.cpp @@ -0,0 +1,270 @@ +//! @file gpu-sage-layer-test.cpp +//! Sage layer test + +#include "galois/Logging.h" +#include "galois/layers/SAGELayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + device_personality = DevicePersonality::GPU_CUDA; + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + unsigned num_layers = 3; + test_graph.ResizeGPULayerVector(num_layers); + test_graph.InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 2, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + + galois::SAGELayerConfig scon; + scon.disable_concat = false; + + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, &p_null, dimension_0, + dcon, scon); + layer_0->InitAllWeightsTo1(); + // sage weights for self + layer_0->InitSelfWeightsTo1(); + + // make sure it runs in a sane manner + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& layer_0_forward_output = + 
layer_0->CopyForwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_0_forward_output[0] == 3, "{} should be 3", + layer_0_forward_output[0]); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_0_forward_output[2] == 9, "{} should be 6", + layer_0_forward_output[2]); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 9); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 45); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 33); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones = + layer_0->AllocateGPU(dummy_ones_v); + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + const std::vector& layer_0_weight_gradients = + layer_0->CopyWeightGradientsFromGPU(); + const std::vector& layer_0_weight_gradients_2 = + layer_0->CopyWeight2GradientsFromGPU(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_0_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_0_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[4] == 21); + + layer_0.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + auto layer_1 = std::make_unique(1, test_graph, &p_back, + dimension_0, dcon, scon); + layer_1->InitAllWeightsTo1(); + layer_1->InitSelfWeightsTo1(); + + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_1_forward_output[0] == 3, "{} should be 3", + layer_1_forward_output[0]); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_1_forward_output[2] == 9, "{} should be 6", + layer_1_forward_output[2]); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 9); + 
GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 33); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(14, 1); + dummy_ones = layer_1->AllocateGPU(dummy_ones_v); + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 4); + + const std::vector& layer_1_weight_gradients = + layer_1->CopyWeightGradientsFromGPU(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); + + const std::vector& layer_1_weight_gradients_2 = + layer_1->CopyWeight2GradientsFromGPU(); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_1_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[4] == 21); + + layer_1.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerConfig config; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; + 
config.disable_aggregate_after_update = false; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + auto layer_2 = std::make_unique(2, test_graph, &p_back, + dimension_0, config, scon); + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& l2_fo = + layer_2->CopyForwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& l2_bo = + layer_2->CopyBackwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + + return 0; +} diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 5d52e80e35..64b7c9e6f0 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -1,4 +1,4 @@ -//! @file convlayer-test.cpp +//! @file gpu-softmaxlayer-test.cpp //! 
Softmax layer test with a test graph #include "galois/Logging.h" @@ -25,9 +25,12 @@ int main() { GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + std::vector back_matrix(49); + galois::PointerWithSize p_back(back_matrix); + // train mode - auto output_layer = - std::make_unique(3, test_graph, dimension_0); + auto output_layer = std::make_unique( + 3, test_graph, &p_back, dimension_0); // input to softmax std::vector softmax_input(49, 0.0); // create input with perfect accuracy @@ -42,9 +45,11 @@ int main() { output_layer->AllocateGPU(softmax_input); output_layer->ForwardPhase(p_softmax_input); + output_layer->PrintForwardOutputGPU(); - const std::vector& prediction_distribution = - output_layer->CopyForwardOutputFromGPU(); + // Softmax reuses output vector for forward phase + const galois::PointerWithSize prediction_distribution = + output_layer->CopyBackwardOutputFromGPU(); // assert that predictions are as expected for (size_t i = 0; i < 5; i++) { @@ -63,17 +68,12 @@ int main() { } output_layer->BackwardPhase(p_softmax_input, nullptr); - const std::vector& backward_output = - output_layer->CopyBackwardOutputFromGPU(); - printf("Output 1\n========\n"); - for (galois::GNNFloat a : backward_output) { - printf("%f\n", a); - } // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); output_layer->ForwardPhase(p_softmax_input); - std::vector pd2 = output_layer->CopyForwardOutputFromGPU(); + galois::PointerWithSize pd2 = + output_layer->CopyBackwardOutputFromGPU(); // validate vertex is index 5 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); @@ -97,17 +97,12 @@ int main() { } output_layer->BackwardPhase(p_softmax_input, nullptr); - const std::vector& backward_output2 = - output_layer->CopyBackwardOutputFromGPU(); - printf("Output 2\n========\n"); - for (galois::GNNFloat a : backward_output2) { - printf("%f\n", a); - } // test mode output_layer->SetLayerPhase(galois::GNNPhase::kTest); output_layer->ForwardPhase(p_softmax_input); - std::vector pd3 = output_layer->CopyForwardOutputFromGPU(); + galois::PointerWithSize pd3 = + output_layer->CopyBackwardOutputFromGPU(); // validate vertex is index 6 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); // all but last are empty distributions @@ -122,12 +117,6 @@ int main() { } output_layer->BackwardPhase(softmax_input, nullptr); - const std::vector& backward_output3 = - output_layer->CopyBackwardOutputFromGPU(); - printf("Output 3\n========\n"); - for (galois::GNNFloat a : backward_output3) { - printf("%f\n", a); - } // TODO in future maybe: add better test for backward phase besides just // running it diff --git a/scripts/galois_gnn_log_parser.R b/scripts/galois_gnn_log_parser.R new file mode 100644 index 0000000000..4e60af0c5d --- /dev/null +++ b/scripts/galois_gnn_log_parser.R @@ -0,0 +1,221 @@ +#!/usr/bin/env Rscript + +####################################################### +# Author: Gurbinder Gill +# Email: gill@cs.utexas.edu +# Date: Oct 8, 2017 +###################################################### +library("optparse") +library('data.table') + +convertZeroTosStr <- function(a) { + if (identical(numeric(0), as.numeric(a)) == 0) { + a <- as.numeric(a) / 1000 + } else { + a <- "0" + } + return (a) +} + +####START: @function to parse commadline################## +# Parses the command line to get the arguments used +parseCmdLine <- function (logData, isSharedMemGaloisLog, graphPassedAsInput) { + ## Select commandline & param rows + cmdLineRow <- subset(logData, CATEGORY == 
"CommandLine" & STAT_TYPE == "PARAM") + + ## Distributed has extra column: HostID + if(isTRUE(isSharedMemGaloisLog)){ + cmdLine <- substring(cmdLineRow[,5], 0) + } + else { + cmdLine <- substring(cmdLineRow[,6], 0) + } + + cmdLineSplit = strsplit(cmdLine, "\\s+")[[1]] + deviceKind = "CPU" + if(!isTRUE(isSharedMemGaloisLog)){ + ## To check the device kind + pos = regexpr('-pset', cmdLineSplit) + deviceKind = "" + if(sum(pos>0) > 0){ + deviceKind = "GPU" + } else { + deviceKind = "CPU" + } + } + + ## First postitional argument is always name of the executable + ### WORKING: split the exePath name found at the position 1 of the argument list and split on "/". + exePathSplit <- strsplit(cmdLineSplit[1], "/")[[1]] + benchmark <- exePathSplit[length(exePathSplit)] + + ## subset the threads row from the table + numThreads <- (subset(logData, CATEGORY == "Threads" & TOTAL_TYPE != "HostValues"))$TOTAL + + input = "noInput" + if(isTRUE(graphPassedAsInput)){ + ## subset the input row from the table + inputPath <- (subset(logData, CATEGORY == "Input" & STAT_TYPE == "PARAM"))$TOTAL + print(inputPath) + if(!identical(inputPath, character(0))){ + inputPathSplit <- strsplit(inputPath, "/")[[1]] + input <- inputPathSplit[length(inputPathSplit)] + } + else { + inputPathSplit <- strsplit(inputPath[[2]], "/")[[1]] + input <- inputPathSplit[length(inputPathSplit)] + } + + ### This is to remore the extension for example .gr or .sgr + inputsplit <- strsplit(input, "[.]")[[1]] + if(length(inputsplit) > 1) { + input <- inputsplit[1] + } + } + + if(isTRUE(isSharedMemGaloisLog)){ + returnList <- list("benchmark" = benchmark, "input" = input, "numThreads" = numThreads, "deviceKind" = deviceKind) + return(returnList) + } + + ## Need more params for distributed galois logs + numHosts <- (subset(logData, CATEGORY == "Hosts"& TOTAL_TYPE != "HostValues"))$TOTAL + + partitionScheme <- (subset(logData, CATEGORY == "PartitionScheme"& TOTAL_TYPE != "HostValues"))$TOTAL + + runID <- (subset(logData, CATEGORY == "Run_UUID"& TOTAL_TYPE != "HostValues"))$TOTAL + + numIterations <- (subset(logData, CATEGORY == "NumIterations_0"& TOTAL_TYPE != "HostValues"))$TOTAL + #If numIterations is not printed in the log files + if(identical(numIterations, character(0))){ + numIterations <- 0 + } + + end2endTimer <- (subset(logData, CATEGORY == "Timer_0"& TOTAL_TYPE != "HostValues"))$TOTAL + end2endTimer <- convertZeroTosStr(end2endTimer) + + aggr_fwd <- (subset(logData, CATEGORY == "AggregateForward"))$TOTAL + aggr_fwd <- convertZeroTosStr(aggr_fwd) + + aggr_bwd <- (subset(logData, CATEGORY == "AggregateBackward"))$TOTAL + aggr_bwd <- convertZeroTosStr(aggr_bwd) + + fwd_total <- (subset(logData, CATEGORY == "ForwardPhase"))$TOTAL + fwd_total <- convertZeroTosStr(fwd_total) + + fwd_xform <- (subset(logData, CATEGORY == "ForwardXForm"))$TOTAL + fwd_xform <- convertZeroTosStr(fwd_xform) + + bwd_total <- (subset(logData, CATEGORY == "BackwardPhase"))$TOTAL + bwd_total <- convertZeroTosStr(bwd_total) + + bwd_xform <- (subset(logData, CATEGORY == "BackwardXForm"))$TOTAL + bwd_xform <- convertZeroTosStr(bwd_xform) + + avg_epoch <- (subset(logData, CATEGORY == "AverageEpochTime"))$TOTAL + avg_epoch <- convertZeroTosStr(avg_epoch) + + final_accuracy <- (subset(logData, CATEGORY == "FinalTestAccuracy"))$TOTAL + + train_time <- (subset(logData, CATEGORY == "TrainingTime"))$TOTAL + train_time <- convertZeroTosStr(train_time) + + sync_aggr <- (subset(logData, CATEGORY == "Sync_GraphAggregateSync_0"))$TOTAL + sync_aggr <- convertZeroTosStr(sync_aggr) + + 
sync_weight <- (subset(logData, CATEGORY == "Sync_WeightGradientsSum"))$TOTAL
+  sync_weight <- convertZeroTosStr(sync_weight)
+
+  buff_breserve_time <- (subset(logData, CATEGORY ==
+                                "BroadcastExtract_GraphAggregateSync_0"))$TOTAL
+  buff_breserve_time <- convertZeroTosStr(buff_breserve_time)
+  buff_bextract_time <- (subset(logData, CATEGORY ==
+                                "BroadcastExtractBatch_GraphAggregateSync_0"))$TOTAL
+  buff_bextract_time <- convertZeroTosStr(buff_bextract_time)
+  buff_rreserve_time <- (subset(logData, CATEGORY ==
+                                "ReduceExtract_GraphAggregateSync_0"))$TOTAL
+  buff_rreserve_time <- convertZeroTosStr(buff_rreserve_time)
+  buff_rextract_time <- (subset(logData, CATEGORY ==
+                                "ReduceExtractBatch_GraphAggregateSync_0"))$TOTAL
+  buff_rextract_time <- convertZeroTosStr(buff_rextract_time)
+
+  print(input)
+  print(partitionScheme)
+  print(numHosts)
+  ## returnList for distributed galois log
+  returnList <- list("RunID" = runID, "Benchmark" = benchmark,
+                     "Input" = input, "PartitionScheme" = partitionScheme,
+                     "Hosts" = numHosts, "NumThreads" = numThreads,
+                     "EndToEndTime" = end2endTimer,
+                     "TrainTime" = train_time,
+                     "TotalForwardTime" = fwd_total,
+                     "ForwardAggregate" = aggr_fwd,
+                     "ForwardXform" = fwd_xform,
+                     "TotalBackwardTime" = bwd_total,
+                     "BackwardAggregate" = aggr_bwd,
+                     "BackwardXform" = bwd_xform,
+                     "AverageEpochTime" = avg_epoch,
+                     "FinalTestAccuracy" = final_accuracy,
+                     "AggregateSync" = sync_aggr,
+                     "Broadcast_buf_reserve" = buff_breserve_time,
+                     "Broadcast_buf_extract" = buff_bextract_time,
+                     "Reduce_buf_reserve" = buff_rreserve_time,
+                     "Reduce_buf_extract" = buff_rextract_time,
+                     "AggregateWeight" = sync_weight,
+                     "DeviceKind" = deviceKind)
+
+  print("List")
+  print(returnList)
+  # Timer is in milli-sec units
+  return(returnList)
+}
+#### END: @function to parse commandline ##################
+
+#### START: @function entry point for galois log parser ##################
+galoisLogParser <- function(input, output) {
+  logData <- read.csv(input, stringsAsFactors=F, strip.white=T)
+
+  printNormalStats = TRUE;
+  print("Parsing commandline")
+  paramList <- parseCmdLine(logData, F, T)
+  print("Parsing timers for shared memory galois log")
+
+  ## if computing RSD then normal stats are not printed
+  if(isTRUE(printNormalStats)){
+    if(!file.exists(output)){
+      print(paste(output, "Does not exist. Creating new file"))
+      print(as.data.frame(paramList))
+      write.csv(as.data.frame(paramList), file=output, row.names=F, quote=F)
+    } else {
+      print(paste("Appending data to the existing file", output))
+      write.table(as.data.frame(paramList), file=output, row.names=F, col.names=F, quote=F, append=T, sep=",")
+    }
+  }
+}
+#### END: @function entry point for galois log parser ##################
+
+#############################################
+## Commandline options.
+#######################################
+option_list = list(
+  make_option(c("-i", "--input"), action="store", default=NA, type='character',
+              help="name of the input file to parse"),
+  make_option(c("-o", "--output"), action="store", default=NA, type='character',
+              help="name of the output file parsed")
+  )
+
+opt_parser <- OptionParser(usage = "%prog [options] -i input.log -o output.csv", option_list=option_list)
+opt <- parse_args(opt_parser)
+
+if (is.na(opt$i)){
+  print_help(opt_parser)
+  stop("At least one argument must be supplied (input file)", call.=FALSE)
+} else {
+  if (is.na(opt$o)){
+    print("Output file name is not specified. 
Using name ouput.csv as default") + opt$o <- "output.csv" + } + galoisLogParser(opt$i, opt$o) +} + +##################### END ##################### diff --git a/scripts/run-gpu.sh b/scripts/run-gpu.sh new file mode 100644 index 0000000000..9f78915a03 --- /dev/null +++ b/scripts/run-gpu.sh @@ -0,0 +1,44 @@ +EXECS=( "gcn-dist" "gcn-dist-pinned" ) +#INPUTS=( "ogbn-products" ) +INPUTS=( "reddit" "ogbn-products" ) +#INPUTS=( "ogbn-papers100M" ) +TYPES=( "sage" ) +LSIZE=16 +NLAYERS=2 +EPOCH=200 +PSET="g" + +for e in "${EXECS[@]}" +do + for t in 0 + do + echo "Iter:"$t + PSET="g" + for n in 1 2 3 4 + do + for i in "${INPUTS[@]}" + do + for k in "${TYPES[@]}" + do + TYPES_STR=${k} + LSIZE_STR=${LSIZE} + for nr in {1..${NLAYERS}} + do + TYPES_STR+=","${k} + LSIZE_STR+=","${LSIZE} + done + echo GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $n ./${e} -inputDirectory=/net/ohm/export/iss/inputs/Learning/ -epochs=${EPOCH} \ + -layerTypes=${TYPES_STR} -disableDropout ${i} -layerSizes=${LSIZE_STR} \ + -numLayers=${NLAYERS} -t=56 -statFile=${e}_${i}_${k}_${LSIZE}_${NLAYERS}_${PSET}_${t}.stats -pset=${PSET} -numNodes=1 + + + CUDA_VISIBLE_DEVICES=2,3,4,5 GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $n ./${e} -inputDirectory=/net/ohm/export/iss/inputs/Learning/ -epochs=${EPOCH} \ + -layerTypes=${TYPES_STR} -disableDropout ${i} -layerSizes=${LSIZE_STR} \ + -numLayers=${NLAYERS} -t=56 -statFile=${e}_${i}_${k}_${LSIZE}_${NLAYERS}_${PSET}_${t}.stats -pset=${PSET} -numNodes=1 + done + done + PSET+="g" + echo $PSET + done + done +done diff --git a/scripts/run_gnnsys.sh b/scripts/run_gnnsys.sh new file mode 100644 index 0000000000..3b6ec31e70 --- /dev/null +++ b/scripts/run_gnnsys.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +INPUTDIR="/net/ohm/export/iss/inputs/Learning/" +#EXECS=( "gcn-dist" "gcn-dist-pinned" ) +EXECS=( "gcn-dist" ) +#INPUTS=( "cora" "reddit" "ogbn-products" ) +INPUTS=( "reddit" ) +LAYERTYPES=( "sage" "gcn" ) +#LAYERTYPES=( "gcn" ) +LAYERSIZE=16 +NUMLAYERS=2 +#PARTITIONS=( "oec" "cvc" ) +PARTITIONS=( "oec" ) +DEVICES="0" + +FLAGS=" -epochs=200" +#FLAGS+=" -disableDropout" +#FLAGS+=" -testInterval=50" + +PREFIX="GALOIS_DO_NOT_BIND_THREADS=1 " + +for input in "${INPUTS[@]}" +do + for partition in "${PARTITIONS[@]}" + do +#for num_gpus in {2..4} + for num_gpus in 1 + do + PSET="g" + for ngpus in $(seq 2 ${num_gpus}) + do + PSET+="g" + done + for layer in "${LAYERTYPES[@]}" + do + for exe in "${EXECS[@]}" + do + # Variable parameters + LSIZE_STR=$LAYERSIZE + LTYPE_STR=$layer + for r in {1..${NUMLAYERS}} + do + LSIZE_STR+=","$LAYERSIZE + LTYPE_STR+=","$layer + done + echo "CUDA_VISIBLE_DEVICES=${DEVICES} GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $num_gpus ./${exe} $input $FLAGS -layerTypes=${LTYPE_STR} -t=1 \ + -pset=${PSET} -layerSizes=${LSIZE_STR} -numNodes=1 --inputDirectory=${INPUTDIR} \ + -statFile=${exe}_${input}_${layer}_${NUMLAYERS}_${LAYERSIZE}_${PSET}_${partition}.stat -partition=${partition}" + + CUDA_VISIBLE_DEVICES=${DEVICES} GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $num_gpus ./${exe} $input $FLAGS -layerTypes=${LTYPE_STR} -t=1 \ + -pset=${PSET} -layerSizes=${LSIZE_STR} -numNodes=1 --inputDirectory=${INPUTDIR} \ + -statFile=${exe}_${input}_${layer}_${NUMLAYERS}_${LAYERSIZE}_${PSET}_${partition}.stat -partition=${partition} + done + done + done + done +done From bd7b7cda0d831622cc5616606aa6a46fa50ec081 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 3 May 2021 22:18:32 -0500 Subject: [PATCH 529/660] Add a GPU wrapper and fix libgnn CMakList error --- libgnn/src/graphs/GNNGraph.cpp | 6 +++++- 
libgnn/test/CMakeLists.txt | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index cb63fbe307..dc175b5afc 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -540,7 +540,11 @@ void galois::graphs::GNNGraph::InitNormFactor() { global_degrees_.resize(partitioned_graph_->size(), 0.0); global_train_degrees_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); - gpu_memory_.InitNormFactor(partitioned_graph_->size()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_memory_.InitNormFactor(partitioned_graph_->size()); + } +#endif } void galois::graphs::GNNGraph::CalculateFullNormFactor() { diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9834b302e7..11c7ab78b8 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -59,14 +59,14 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) foreach(host_count ${hosts}) - add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} aggregate-sync-test) + add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./aggregate-sync-test) set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() add_executable(back-conv-test back-conv-test.cpp) target_link_libraries(back-conv-test galois_gnn) foreach(host_count ${hosts}) - add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} back-conv-test) + add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./back-conv-test) set_tests_properties(run-back-conv-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() From 50a767a8686bbf3844690ed6a28f7fb02630f763 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 3 May 2021 16:18:42 -0500 Subject: [PATCH 530/660] Minibatch generator loops through masters only Minibatch generator should only loop through masters to avoid duplicates in minibatches in a distributed setting. This assumes like all other parts of the code at the moment are that master nodes always take the prefix of node ids. Generator takes the right bound of it and uses it instead of the mask size. --- libgnn/include/galois/MinibatchGenerator.h | 15 +++++++++------ libgnn/include/galois/graphs/GNNGraph.h | 4 ++-- libgnn/src/MinibatchGenerator.cpp | 10 ++++++---- libgnn/src/graphs/GNNGraph.cpp | 13 +++++++++---- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 0bd063b90c..11bce02848 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -1,6 +1,7 @@ #pragma once #include "galois/GNNTypes.h" +#include "galois/Logging.h" namespace galois { @@ -8,21 +9,23 @@ namespace galois { //! 
the minibatch for class MinibatchGenerator { public: - MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size) - : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size} { + MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, + size_t master_bound) + : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, + master_bound_{master_bound} { + GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } void GetNextMinibatch(std::vector* batch_mask); //! True if no more minibatches from this generator - bool NoMoreMinibatches() { - return current_position_ == mask_to_minibatch_.size(); - } + bool NoMoreMinibatches() { return current_position_ == master_bound_; } //! Reset the only state (a position bit) void ResetMinibatchState() { current_position_ = 0; } private: const GNNMask& mask_to_minibatch_; size_t minibatch_size_; - size_t current_position_{0}; + size_t current_position_; + size_t master_bound_; }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2d4bb5356b..7d6867a5c8 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -317,8 +317,8 @@ class GNNGraph { // clear before remake train_batcher_.reset(); } - train_batcher_ = std::make_unique(local_training_mask_, - train_batch_size); + train_batcher_ = std::make_unique( + local_training_mask_, train_batch_size, *end_owned()); local_minibatch_mask_.resize(partitioned_graph_->size()); } diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 7c3b6dd831..48570c094e 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -3,16 +3,18 @@ void galois::MinibatchGenerator::GetNextMinibatch( std::vector* batch_mask) { - std::fill(batch_mask->begin(), batch_mask->end(), 0); assert(current_position_ <= mask_to_minibatch_.size()); + assert(current_position_ <= master_bound_); assert(batch_mask->size() == mask_to_minibatch_.size()); - if (current_position_ >= mask_to_minibatch_.size()) { + + std::fill(batch_mask->begin(), batch_mask->end(), 0); + if (current_position_ >= master_bound_) { return; } size_t current_count = 0; // start from last positiion - while (current_position_ < mask_to_minibatch_.size()) { + while (current_position_ < master_bound_) { if (mask_to_minibatch_[current_position_]) { // XXX and a master node; seed nodes only exist locally (*batch_mask)[current_position_] = 1; @@ -27,7 +29,7 @@ void galois::MinibatchGenerator::GetNextMinibatch( // advance current position to next set bit for next call (or to end to detect // no more minibatches while (!mask_to_minibatch_[current_position_] && - (current_position_ < mask_to_minibatch_.size())) { + (current_position_ < master_bound_)) { current_position_++; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index dc175b5afc..8b1374b271 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -100,10 +100,6 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // init norm factors (involves a sync call) InitNormFactor(); - // XXX remove this - test_batcher_ = - std::make_unique(local_testing_mask_, 2000); - #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { // allocate/copy data structures over to GPU @@ -936,6 +932,15 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { void 
galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { train_batcher_->GetNextMinibatch(&local_minibatch_mask_); +#ifndef NDEBUG + galois::gPrint("Minibatch : "); + for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { + if (local_minibatch_mask_[i]) { + galois::gPrint(i, ","); + } + } + galois::gPrint("\n"); +#endif SetupNeighborhoodSample(GNNPhase::kBatch); } From 8366824ba002505687af716304959abbb0e6d2db Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 4 May 2021 16:32:52 -0500 Subject: [PATCH 531/660] Fixes to make distributed minibatch work This commit allows for distributed minibatching. 1) Fixes sample sync: write dest rather than write source since sample activation is occuring on dest. 2) Adds reduce call to check if all hosts no longer have any more minibatches before moving on to the next minibatch. 3) Safety assertions added to subgraph. --- libgnn/include/galois/GraphNeuralNetwork.h | 3 +++ .../graphs/GraphAggregationSyncStructures.h | 4 +--- libgnn/src/GraphNeuralNetwork.cpp | 5 ++-- libgnn/src/graphs/GNNGraph.cpp | 23 +++++++++++-------- libgnn/src/graphs/GNNSubgraph.cpp | 6 ++++- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 580738b133..9925764bef 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -203,6 +203,9 @@ class GraphNeuralNetwork { //! Number of layers that use the graph (e.g. SAGE, GCN) size_t num_graph_user_layers_; + //! Termination detection for minibatching + galois::DGAccumulator work_left_; + #ifdef GALOIS_ENABLE_GPU //! Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 0dd43c3308..51f52e5323 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -23,10 +23,8 @@ struct SampleFlagSync { //! return a vector of floats to sync static ValTy extract(uint32_t, char& i) { return i; } - //! reduction is addition in this case; add received vector to - //! 
own vector static bool reduce(uint32_t, char& i, ValTy y) { - if (y > i) { + if (y) { i = y; assert(i == 1); return true; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index cb139191b4..c5d2aea91e 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -237,6 +237,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // XXX // create mini batch graphs and loop until minibatches on all hosts done while (true) { + work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches graph_->PrepareNextTrainMinibatch(); @@ -263,13 +264,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } const PointerWithSize batch_pred = DoInference(); - DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); galois::gPrint("Epoch ", epoch, " Batch ", batch_num, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); + work_left_ += graph_->MoreTrainMinibatches(); // XXX sync across all hosts minibatcher state - if (!graph_->MoreTrainMinibatches()) { + if (!work_left_.reduce()) { break; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8b1374b271..50af592b99 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -510,7 +510,6 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } } else { - // XXX i can get local sample counts from here if i need it size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, &local_training_mask_); @@ -837,11 +836,13 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { // continue the exploration galois::do_all( galois::iterate(new_nodes), - [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + [&](uint32_t new_node_id) { + SetSampledNode(new_node_id); + }, galois::loopname("NeighborhoodSampleSet")); - // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync("SampleSync"); + sync_substrate_->sync( + "SampleSync"); } void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, @@ -865,7 +866,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // times (degree norm is 1 / degree) // XXX training degree + other norm, not global double probability_of_reject = - std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); + std::pow(1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); // loop through edges, turn "on" edge with some probability for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (sample_rng_.DoBernoulli(probability_of_reject)) { @@ -911,7 +912,8 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::loopname("NeighborhoodSampleSet")); // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync("SampleSync"); + sync_substrate_->sync( + "SampleSync"); } //! 
Construct the subgraph from sampled edges and corresponding nodes @@ -933,13 +935,16 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { train_batcher_->GetNextMinibatch(&local_minibatch_mask_); #ifndef NDEBUG - galois::gPrint("Minibatch : "); + size_t count = 0; + // galois::gPrint("Minibatch : "); for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { if (local_minibatch_mask_[i]) { - galois::gPrint(i, ","); + // galois::gPrint(partitioned_graph_->getGID(i), ","); + count++; } } - galois::gPrint("\n"); + // galois::gPrint("\n"); + galois::gInfo(host_prefix(), "num batched nodes ", count); #endif SetupNeighborhoodSample(GNNPhase::kBatch); } diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 387e3fc250..a5f6d925ec 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -137,7 +137,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( [&](uint32_t node_id) { if (gnn_graph.IsInSampledGraph(node_id)) { uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; - + assert(subgraph_id != std::numeric_limits::max()); uint32_t out_location = 0; uint32_t in_location = 0; if (subgraph_id != 0) { @@ -147,7 +147,11 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( for (auto out_edge_iter : gnn_graph.edges(node_id)) { if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + assert( + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != + std::numeric_limits::max()); subedge_to_original_edge_[out_location] = *out_edge_iter; + underlying_graph_.constructEdge( out_location++, lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); From 6566351b4478b11fd342a5b8e2f12c3206f4c43a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 5 May 2021 13:35:15 -0500 Subject: [PATCH 532/660] Fanout CLI, sample bitset, sampling behavior, time 1) Added batch timer to minibatch setting 2) Fanout command line argument for sampling 3) "DoGraphSampling" argument needs to be toggled to do any kind of sampling, else all edges will be selected when creating subgraph. 4) Bitset used to sync sampling flag during sampling. TODO doInductiveTraining needs to be changed to "use training subgraph" or something similar; inductive training needs to be its own argument (that will be done next commit) --- libgnn/include/galois/GraphNeuralNetwork.h | 4 +++ .../graphs/GraphAggregationSyncStructures.h | 10 ++++++ libgnn/src/GraphNeuralNetwork.cpp | 35 +++++++++++-------- libgnn/src/graphs/GNNGraph.cpp | 33 +++++++++-------- lonestar/libgnnbench/src/Input.cpp | 30 ++++++++++++++-- 5 files changed, 78 insertions(+), 34 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 9925764bef..fcc620738b 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -102,6 +102,8 @@ class GraphNeuralNetworkConfig { // public because they are independent of other settings //! Graph sampling bool do_sampling_{false}; + // XXX Change the name of this var; it just means to create subgraph + // based on training nodes //! Inductive = training ignores test/val set bool inductive_training_{false}; //! Interval to run validation set on network at; 0 = no run @@ -109,6 +111,8 @@ class GraphNeuralNetworkConfig { //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; unsigned train_minibatch_size_{0}; + //! 
Fan out used for sampling (if sampling is enabled) + std::vector fan_out_vector_; private: //! Number of layers to construct in the GNN not including the output diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 51f52e5323..89ccc83324 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -12,6 +12,7 @@ extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; extern galois::LargeArray* gnn_lid_to_sid_pointer_; +extern galois::DynamicBitSet bitset_sample_flag_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -55,6 +56,15 @@ struct SampleFlagSync { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +struct SampleFlagBitset { + static constexpr bool is_vector_bitset() { return false; } + static constexpr bool is_valid() { return true; } + static galois::DynamicBitSet& get() { return bitset_sample_flag_; } + static void reset_range(size_t begin, size_t end) { + bitset_sample_flag_.reset(begin, end); + } +}; + struct GNNSumAggregate { using ValTy = galois::gstl::Vector; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index c5d2aea91e..5336e07756 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -159,6 +159,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; size_t inductive_nodes = 0; + // this subgraph only needs to be created once if (config_.inductive_training_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph graph_->SetupNeighborhoodSample(); @@ -184,6 +185,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); + // swap to inductive graph if (config_.inductive_training_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); @@ -192,6 +194,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } + // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { graph_->SetupNeighborhoodSample(); size_t num_sampled_layers = 0; @@ -203,16 +206,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - if (num_sampled_layers == 0) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); - } else { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); - } + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers]); num_sampled_layers++; } } - galois::gDebug("Number of sampled layers is ", num_sampled_layers); - // resize layer matrices size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); @@ -234,9 +232,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { size_t batch_num = 0; - // XXX // create mini batch graphs and loop until minibatches on all hosts done while (true) { + const std::string btime_name("Epoch" + 
std::to_string(epoch) + "Batch" + + std::to_string(batch_num)); + galois::StatTimer batch_timer(btime_name.c_str(), "GraphNeuralNetwork"); + batch_timer.start(); work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches @@ -247,17 +248,19 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - if (num_sampled_layers == 0) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + if (config_.do_sampling()) { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers]); } else { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); } num_sampled_layers++; } } // resize layer matrices size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); - galois::gPrint(num_subgraph_nodes, "\n"); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(num_subgraph_nodes); @@ -266,11 +269,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num, + + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); work_left_ += graph_->MoreTrainMinibatches(); - // XXX sync across all hosts minibatcher state - if (!work_left_.reduce()) { + char global_work_left = work_left_.reduce(); + batch_timer.stop(); + if (!global_work_left) { break; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 50af592b99..8e021327b6 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -47,6 +47,8 @@ galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; +galois::DynamicBitSet bitset_sample_flag_; + //! 
For synchronization of sampled degrees galois::DynamicBitSet bitset_sampled_degrees_; std::vector>* gnn_sampled_out_degrees_; @@ -764,8 +766,8 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { use_subgraph_ = false; - new_sampled_nodes_.resize(size()); - new_sampled_nodes_.reset(); + bitset_sample_flag_.resize(size()); + bitset_sample_flag_.reset(); // for now, if training node, it goes into seed node galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { @@ -795,7 +797,6 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { - assert(subgraph_is_inductive_); use_subgraph_ = false; galois::GAccumulator sampled; @@ -817,7 +818,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { MakeEdgeSampled(edge_iter, agg_layer_num); if (!IsInSampledGraph( partitioned_graph_->getEdgeDst(edge_iter))) { - new_sampled_nodes_.set( + bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } sampled += 1; @@ -831,18 +832,16 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { galois::gPrint("Num sampled edges in inductive graph is ", sampled.reduce(), " out of ", total.reduce(), "\n"); - std::vector new_nodes = new_sampled_nodes_.getOffsets(); + std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( galois::iterate(new_nodes), - [&](uint32_t new_node_id) { - SetSampledNode(new_node_id); - }, + [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, galois::loopname("NeighborhoodSampleSet")); - // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync( - "SampleSync"); + sync_substrate_ + ->sync( + "SampleSync"); } void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, @@ -881,7 +880,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, MakeEdgeSampled(edge_iter, sample_layer_num); if (!IsInSampledGraph( partitioned_graph_->getEdgeDst(edge_iter))) { - new_sampled_nodes_.set( + bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } bitset_sampled_degrees_.set(*src_iter); @@ -902,7 +901,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", sampled.reduce(), " out of ", total.reduce()); - std::vector new_nodes = new_sampled_nodes_.getOffsets(); + std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can // continue the exploration @@ -911,9 +910,9 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, galois::loopname("NeighborhoodSampleSet")); - // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync( - "SampleSync"); + sync_substrate_ + ->sync( + "SampleSync"); } //! 
Construct the subgraph from sampled edges and corresponding nodes @@ -944,7 +943,7 @@ void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { } } // galois::gPrint("\n"); - galois::gInfo(host_prefix(), "num batched nodes ", count); + galois::gInfo(host_prefix(), "Batched nodes ", count); #endif SetupNeighborhoodSample(GNNPhase::kBatch); } diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 921baaa4df..7914cdf6ea 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -26,13 +26,13 @@ llvm::cl::opt partition_scheme( "Original Cartesian Vertex-Cut")), cll::init(galois::graphs::GNNPartitionScheme::kOEC)); -llvm::cl::opt num_layers( +llvm::cl::opt num_layers( "numLayers", cll::desc( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list layer_sizes( +llvm::cl::list layer_sizes( "layerSizes", cll::desc( "Comma separated list of numbers specifying " @@ -53,6 +53,12 @@ llvm::cl::list cl_layer_types( clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), cll::CommaSeparated); +llvm::cl::list cl_fan_out_vector( + "samplingFanOut", + cll::desc( + "Comma separated list of layer fanout if sampling/batching is used"), + cll::CommaSeparated); + llvm::cl::opt disable_dropout("disableDropout", cll::desc("If true (off by default), disables dropout of " @@ -253,6 +259,25 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { return std::make_unique(opt_sizes, num_layers); } +std::vector CreateFanOutVector() { + std::vector fan_out; + // fan out only matters if graph sampling is enabled + if (do_graph_sampling) { + // assert fan out size is the same + if (cl_fan_out_vector.size() == num_layers) { + + } else { + galois::gWarn("Fan out specification does not equal number of layers: " + "using default 10 followed by 25s"); + fan_out.emplace_back(10); + for (unsigned i = 1; i < num_layers; i++) { + fan_out.emplace_back(25); + } + } + } + return fan_out; +} + std::unique_ptr InitializeGraphNeuralNetwork() { // partition/load graph auto gnn_graph = std::make_unique( @@ -273,6 +298,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { gnn_config.validation_interval_ = val_interval; gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.fan_out_vector_ = CreateFanOutVector(); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From b6696aafbcf746b0f968ed0feb520a0a92884695 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 5 May 2021 13:51:12 -0500 Subject: [PATCH 533/660] doInductiveTraining changed to useTrainingSubgraph Inductive training in my definition means that you only use training nodes and nonval/test nodes during training phase. Option changed to training subgraph because it's more in line with what occurs: you create subgraph based on training subgraph and don't need to compute the rest. Right now in this commit it will still only pick up training/other nodes, but next commit will add another option that will allow it to pick up val/test nodes during sampling/subgraph construction. 
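
To make the intended semantics concrete: with the inductive option, an edge is only
eligible for the sampled subgraph when its destination is a training vertex or an
unlabeled ("other") vertex. A minimal sketch of that eligibility check follows; it
reuses the IsValidForPhase/GNNPhase names that appear in the GNNGraph diffs of the
surrounding patches, but it is an illustrative example and not code from this patch.

// Illustrative sketch only (assumes the GNNGraph interface shown in the diffs).
// Returns true if `dst` may be pulled into the sampled subgraph.
#include "galois/graphs/GNNGraph.h"

bool DestinationEligible(const galois::graphs::GNNGraph& graph, uint32_t dst,
                         bool inductive_subgraph) {
  if (!inductive_subgraph) {
    // transductive case: any neighbor can be aggregated
    return true;
  }
  // inductive case: skip validation/test vertices entirely
  return graph.IsValidForPhase(dst, galois::GNNPhase::kTrain) ||
         graph.IsValidForPhase(dst, galois::GNNPhase::kOther);
}
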
--- libgnn/include/galois/GraphNeuralNetwork.h | 5 ++-- libgnn/include/galois/graphs/GNNGraph.h | 4 +-- libgnn/include/galois/layers/GNNLayer.h | 3 --- libgnn/src/GraphNeuralNetwork.cpp | 27 +++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 6 ++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 12 --------- lonestar/libgnnbench/src/Input.cpp | 14 +++++----- 7 files changed, 27 insertions(+), 44 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index fcc620738b..bf4b4b2f3b 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -104,8 +104,9 @@ class GraphNeuralNetworkConfig { bool do_sampling_{false}; // XXX Change the name of this var; it just means to create subgraph // based on training nodes - //! Inductive = training ignores test/val set - bool inductive_training_{false}; + //! Creates subgraph that is only composed of training nodes (reduces + //! redundant work since you won't calculate things you don't need) + bool use_train_subgraph_{false}; //! Interval to run validation set on network at; 0 = no run unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 7d6867a5c8..29d25afaf7 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -358,7 +358,7 @@ class GNNGraph { GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_) { size_t degree; - if (!subgraph_is_inductive_) { + if (!subgraph_is_train_) { // case because degrees in each layer differ degree = sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; @@ -638,7 +638,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; - bool subgraph_is_inductive_{false}; + bool subgraph_is_train_{false}; std::unique_ptr train_batcher_; std::unique_ptr test_batcher_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 9a71432471..728c0ecae4 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -65,8 +65,6 @@ struct GNNLayerConfig { bool disable_self_aggregate{false}; //! Graph sampling flag in use or not bool do_sampling{false}; - //! Inductive layer means for aggregation all non-training nodes are ignored - bool inductive_training_{false}; // TODO activation type; for now default is softmax //! Sets settings such that testing is easy @@ -178,7 +176,6 @@ class GNNLayer { //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } bool IsSampledLayer() const { return config_.do_sampling; } - bool IsInductiveLayer() const { return config_.inductive_training_; } //! Sets the graph user layer number; important for sampling as this index //! 
determines which index to use when checking for sampled edges void SetGraphUserLayerNumber(size_t num) { graph_user_layer_number_ = num; } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 5336e07756..462d8813fc 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -13,9 +13,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( galois::GraphNeuralNetworkConfig&& config) : graph_(std::move(graph)), optimizer_(std::move(optimizer)), config_(std::move(config)) { - if (config_.do_sampling_ && config_.inductive_training_) { - GALOIS_LOG_FATAL("Do not set inductive training and sampling at same time " - "(sampling is inductive already)"); + if (config_.do_sampling_ && config_.use_train_subgraph_) { + GALOIS_LOG_FATAL("Do not set train subgraph and sampling at same time " + "(sampling uses training subgraph already)"); } // max number of rows that can be passed as inputs; allocate space for it as // this will be the # of rows for each layer @@ -103,11 +103,11 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } // XXX test minibatch - if (config_.do_sampling() || config_.inductive_training_ || + if (config_.do_sampling() || config_.use_train_subgraph_ || config.train_minibatch_size()) { // output layer not included; it will never involve sampling graph_->InitializeSamplingData(num_graph_user_layers_, - config_.inductive_training_); + config_.use_train_subgraph_); } if (config_.train_minibatch_size()) { @@ -158,9 +158,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; - size_t inductive_nodes = 0; + size_t train_subgraph_nodes = 0; // this subgraph only needs to be created once - if (config_.inductive_training_ && !config_.train_minibatch_size()) { + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph graph_->SetupNeighborhoodSample(); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); @@ -172,10 +172,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } // resize layer matrices - inductive_nodes = graph_->ConstructSampledSubgraph(); + train_subgraph_nodes = graph_->ConstructSampledSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - (*layer)->ResizeRows(inductive_nodes); + (*layer)->ResizeRows(train_subgraph_nodes); } } @@ -185,12 +185,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); - // swap to inductive graph - if (config_.inductive_training_ && !config_.train_minibatch_size()) { + // swap to train subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - (*layer)->ResizeRows(inductive_nodes); + (*layer)->ResizeRows(train_subgraph_nodes); } } @@ -297,7 +297,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { bool do_test = config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; - // get real norm factor back if altered by sampling or inductive training if (do_validate || do_test) { // disable subgraph graph_->DisableSubgraph(); @@ -348,8 +347,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); - // get back inductive norm factor as necessary; sampling norm is handled - // at beginning of every iteration } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8e021327b6..f2650ca8f9 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -760,7 +760,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, array.create(partitioned_graph_->size()); } } else { - subgraph_is_inductive_ = true; + subgraph_is_train_ = true; } } @@ -784,7 +784,7 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - if (!subgraph_is_inductive_) { + if (!subgraph_is_train_) { galois::do_all( galois::iterate(sampled_out_degrees_), [&](galois::LargeArray& array) { @@ -846,7 +846,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample) { - assert(!subgraph_is_inductive_); + assert(!subgraph_is_train_); use_subgraph_ = false; galois::GAccumulator sampled; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 7c22627f2f..82522fafd9 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -326,12 +326,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) - return; - } - if (IsSampledLayer()) { // XXX(loc) GALOIS_LOG_WARN( @@ -367,12 +361,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( graphs::bitset_graph_aggregate.set(src); if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; - } - if (IsSampledLayer()) { // ignore non-sampled nodes if (layer_phase_ == GNNPhase::kTrain && diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 7914cdf6ea..d33a9bf422 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -114,11 +114,12 @@ llvm::cl::opt "use every epoch at a 50\% drop rate"), cll::init(false)); -llvm::cl::opt - do_inductive_training("doInductiveTraining", - cll::desc("If true (off by default), during training " - "all non-train nodes are ignored"), - cll::init(false)); +llvm::cl::opt use_train_subgraph( + "useTrainingSubgraph", + cll::desc( + "If true (off by default), during training " + "only compute minimum required for training nodes in training phase"), + cll::init(false)); llvm::cl::opt train_minibatch_size("trainMinibatchSize", @@ -219,7 +220,6 @@ galois::GNNLayerConfig CreateLayerConfig() { layer_config.disable_normalization = disable_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; layer_config.disable_self_aggregate = disable_self_aggregate; - layer_config.inductive_training_ = do_inductive_training; 
return layer_config; } @@ -294,7 +294,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.inductive_training_ = do_inductive_training; + gnn_config.use_train_subgraph_ = use_train_subgraph; gnn_config.validation_interval_ = val_interval; gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; From 90205221f8c459971aa0394394f6250917125cc2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 5 May 2021 15:27:47 -0500 Subject: [PATCH 534/660] Inductive subgraph option, fan out fix 1) Adds CLI to make it so only train/other nodes are looked at when constructing a subgraph. 2) Fixes the fanout implementation; I did not finish implementing it when it was intially committed. --- libgnn/include/galois/GraphNeuralNetwork.h | 4 +- libgnn/include/galois/graphs/GNNGraph.h | 5 +- libgnn/src/GraphNeuralNetwork.cpp | 12 ++-- libgnn/src/graphs/GNNGraph.cpp | 83 +++++++++++++--------- lonestar/libgnnbench/src/Input.cpp | 11 ++- 5 files changed, 72 insertions(+), 43 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index bf4b4b2f3b..fe1cb17477 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -102,11 +102,11 @@ class GraphNeuralNetworkConfig { // public because they are independent of other settings //! Graph sampling bool do_sampling_{false}; - // XXX Change the name of this var; it just means to create subgraph - // based on training nodes //! Creates subgraph that is only composed of training nodes (reduces //! redundant work since you won't calculate things you don't need) bool use_train_subgraph_{false}; + //! If on, subgraphs cannot pick up val/test nodes + bool inductive_subgraph_{false}; //! Interval to run validation set on network at; 0 = no run unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 29d25afaf7..fee528c4b8 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -266,9 +266,10 @@ class GNNGraph { void SetupNeighborhoodSample(GNNPhase seed_phase); //! Choose all edges from sampled nodes - void SampleAllEdges(size_t agg_layer_num); + void SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph); //! Sample neighbors of nodes that are marked as ready for sampling - void SampleEdges(size_t sample_layer_num, size_t num_to_sample); + void SampleEdges(size_t sample_layer_num, size_t num_to_sample, + bool inductive_subgraph); //! 
Construct the subgraph from sampled edges and corresponding nodes size_t ConstructSampledSubgraph(); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 462d8813fc..e53cac0b13 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -168,7 +168,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_); } } // resize layer matrices @@ -207,7 +208,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers]); + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_); num_sampled_layers++; } } @@ -252,9 +254,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // relevant neighbors if (config_.do_sampling()) { graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers]); + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_); } else { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_); } num_sampled_layers++; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index f2650ca8f9..5c159dc816 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -796,7 +796,8 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sampled_degrees_.reset(); } -void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { +void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, + bool inductive_subgraph) { use_subgraph_ = false; galois::GAccumulator sampled; @@ -811,19 +812,22 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { if (IsInSampledGraph(src_iter)) { // marks ALL edges of nodes that connect to train/other nodes for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) || - IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - MakeEdgeSampled(edge_iter, agg_layer_num); - if (!IsInSampledGraph( - partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + total += 1; + if (inductive_subgraph) { + if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; } - sampled += 1; } - total += 1; + + MakeEdgeSampled(edge_iter, agg_layer_num); + if (!IsInSampledGraph(partitioned_graph_->getEdgeDst(edge_iter))) { + bitset_sample_flag_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } + sampled += 1; } } }, @@ -845,7 +849,8 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { } void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, - size_t num_to_sample) { + size_t num_to_sample, + bool inductive_subgraph) { assert(!subgraph_is_train_); use_subgraph_ = false; 
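
A note on the fan-out math used in the next hunk: for a vertex with degree d and a
layer fan-out of k, the probability that one particular edge is never chosen in k
independent uniform draws is (1 - 1/d)^k, which is the probability_of_reject computed
below (the degree norm is 1/d). For example, with d = 20 and k = 10, (1 - 1/20)^10 =
0.95^10 is roughly 0.60, so each edge survives with probability of roughly 0.40. A
self-contained sketch of the same per-edge test is shown here; std::bernoulli_distribution
stands in for the repository's sample_rng_.DoBernoulli helper, whose exact convention is
not reproduced, so treat this as an illustration rather than the patch's implementation.

#include <cmath>
#include <random>

// Illustrative sketch (not part of the patch): keep or reject a single edge of a
// degree-`degree` vertex when the layer fan-out is `num_to_sample`.
bool KeepEdge(double degree, unsigned num_to_sample, std::mt19937& rng) {
  // chance the edge is never picked across num_to_sample uniform draws
  double probability_of_reject =
      std::pow(1.0 - 1.0 / degree, static_cast<double>(num_to_sample));
  std::bernoulli_distribution reject(probability_of_reject);
  return !reject(rng); // true -> edge stays in the sampled subgraph
}
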
@@ -863,33 +868,43 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, if (IsInSampledGraph(src_iter)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) - // XXX training degree + other norm, not global - double probability_of_reject = - std::pow(1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); + double probability_of_reject; + if (!inductive_subgraph) { + probability_of_reject = + std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); + } else { + probability_of_reject = std::pow( + 1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); + } + // loop through edges, turn "on" edge with some probability for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { + total += 1; if (sample_rng_.DoBernoulli(probability_of_reject)) { - // only take if node is training node or a node not classified - // into train/test/val - if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) || - IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - // if here, it means edge accepted; set sampled on, mark source - // as part of next set - MakeEdgeSampled(edge_iter, sample_layer_num); - if (!IsInSampledGraph( - partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + if (inductive_subgraph) { + // only take if node is training node or a node not classified + // into train/test/val + if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; } - bitset_sampled_degrees_.set(*src_iter); - // degree increment - sampled_out_degrees_[sample_layer_num][*src_iter]++; - sampled += 1; } + + // if here, it means edge accepted; set sampled on, mark source + // as part of next set + MakeEdgeSampled(edge_iter, sample_layer_num); + if (!IsInSampledGraph( + partitioned_graph_->getEdgeDst(edge_iter))) { + bitset_sample_flag_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } + bitset_sampled_degrees_.set(*src_iter); + // degree increment + sampled_out_degrees_[sample_layer_num][*src_iter]++; + sampled += 1; } - total += 1; } // total_nodes += 1; } diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d33a9bf422..d15adf2d9f 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -121,6 +121,12 @@ llvm::cl::opt use_train_subgraph( "only compute minimum required for training nodes in training phase"), cll::init(false)); +llvm::cl::opt inductive_subgraph( + "inductiveSubgraph", + cll::desc("If true (off by default), only sample training/other nodes when " + "constructing subgraph"), + cll::init(false)); + llvm::cl::opt train_minibatch_size("trainMinibatchSize", cll::desc("Size of training minibatch (default 0)"), @@ -265,7 +271,9 @@ std::vector CreateFanOutVector() { if (do_graph_sampling) { // assert fan out size is the same if (cl_fan_out_vector.size() == num_layers) { - + for (unsigned i = 0; i < num_layers; i++) { + fan_out.emplace_back(cl_fan_out_vector[i]); + } } else { galois::gWarn("Fan out specification does not equal number of layers: " "using default 10 followed by 25s"); @@ -298,6 +306,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { gnn_config.validation_interval_ = val_interval; gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; + 
gnn_config.inductive_subgraph_ = inductive_subgraph; gnn_config.fan_out_vector_ = CreateFanOutVector(); // optimizer From f56ef5c4a1f10c9cc7b7e5a405ab235ce20974fd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 6 May 2021 15:37:43 -0500 Subject: [PATCH 535/660] Addition of an output rows dim for layer dims Adding output rows field to the layer dims struct; this will be required for an optimization coming later to resize intermediate matrices as moving down layers in a sampled subgraph setting. --- libgnn/include/galois/layers/GNNLayer.h | 2 ++ libgnn/src/GraphNeuralNetwork.cpp | 6 ++++-- libgnn/src/layers/GNNLayer.cpp | 13 +++++++++++-- libgnn/src/layers/SAGELayer.cpp | 6 +++--- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 728c0ecae4..10e44511aa 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -41,6 +41,8 @@ struct GNNLayerDimensions { size_t input_columns; //! Number of columns output of this layer size_t output_columns; + //! If rows change, this is set. Otherwise, ignored. + size_t output_rows; }; //! Config options for operations that can occur in a layer diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index e53cac0b13..2ed988bed0 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -46,7 +46,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( GNNLayerDimensions layer_dims = {.input_rows = max_rows, .input_columns = prev_layer_columns, .output_columns = - config_.intermediate_layer_size(i)}; + config_.intermediate_layer_size(i), + .output_rows = max_rows}; switch (layer_type) { case GNNLayerType::kGraphConvolutional: @@ -121,7 +122,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( // get last intermediate layer column size .input_columns = config_.intermediate_layer_size( config_.num_intermediate_layers() - 1), - .output_columns = config_.output_layer_size()}; + .output_columns = config_.output_layer_size(), + .output_rows = max_rows}; switch (config_.output_layer_type()) { case (GNNOutputLayerType::kSoftmax): diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 0c01bb788b..2ba3aa5ae3 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,6 +9,11 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const GNNLayerConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { + // TODO(loc) + // this is currently a backward-compatibility hack, need to have caller + // set output rows rather than created here + layer_dimensions_.output_rows = layer_dimensions_.input_rows; + if (config_.allocate_weights) { // dropout allocation; dropout is same as input if (!config_.disable_dropout) { @@ -38,6 +43,10 @@ galois::GNNLayer::GNNLayer(size_t layer_num, GlorotBengioInit(&layer_weights_); } + // TODO(loc) optimize this and layer creation in general + // this does not use output_rows and assumes the worst case where + // all nodes are generated + // for now it's kept as input_rows so as to not break things size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; @@ -269,7 +278,7 @@ void galois::GNNLayer::Activation() { galois::do_all( galois::iterate(static_cast(0), - layer_dimensions_.input_rows * + layer_dimensions_.output_rows * layer_dimensions_.output_columns), [&](size_t i) { if (forward_output_matrix_[i] > 0.0) { @@ 
-299,7 +308,7 @@ void galois::GNNLayer::ActivationDerivative( // keep gradient if the original output was greater than 0 galois::do_all( galois::iterate(static_cast(0), - layer_dimensions_.input_rows * + layer_dimensions_.output_rows * layer_dimensions_.output_columns), [&](size_t i) { // it was <= 0 before; set back to 0 diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 9696a9b460..32aa863a0a 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -100,7 +100,7 @@ galois::SAGELayer::SAGELayer(size_t layer_num, } size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; + layer_dimensions_.output_rows * layer_dimensions_.output_columns; // only needed if out temp would be smaller than intemp if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { @@ -179,7 +179,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(input_embeddings.size() >= (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); @@ -225,7 +225,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( } assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); timer.stop(); From 068437e1fdc9d448dfd455e0c649a320f32445c4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 7 May 2021 15:30:27 -0500 Subject: [PATCH 536/660] WIP: optimize row size as layers progress 1) Seed choosing for sampling needs to sync the seeds across hosts if only master nodes get sampled (which is what it has been changed to if I remember correctly). Needs sync esp. if minibatching since not all hosts know what other hosts are minibatching. 2) In minibatch setting, rows will decrease as some nodes will not need to aggregate/not need xform. This has been implemented in minibatching setting (but not others, since WIP). 3) To support #2 above, subgraph construction has changed so that the SIDs are suffixed in the order that they are deactivated as the rows progress. This increases construction time (how much is something I need to check). 4) SAGE layer updated to deal with #2; input/output rows used depending on phase. NOTE: Same as other commits: SAGE only; GCN is super outdated at this point. --- libgnn/include/galois/graphs/GNNGraph.h | 25 ++++-- libgnn/include/galois/graphs/GNNSubgraph.h | 5 +- libgnn/include/galois/layers/GNNLayer.h | 6 ++ libgnn/src/GraphNeuralNetwork.cpp | 94 ++++++++++++++++----- libgnn/src/graphs/GNNGraph.cpp | 97 ++++++++++++++++++---- libgnn/src/graphs/GNNSubgraph.cpp | 38 ++++++--- libgnn/src/layers/GNNLayer.cpp | 1 + libgnn/src/layers/SAGELayer.cpp | 65 ++++++++++----- 8 files changed, 247 insertions(+), 84 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index fee528c4b8..5ff892057c 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -262,17 +262,24 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! 
Set seed nodes, i.e., nodes that are being predicted on - void SetupNeighborhoodSample() { SetupNeighborhoodSample(GNNPhase::kTrain); } - void SetupNeighborhoodSample(GNNPhase seed_phase); + size_t SetupNeighborhoodSample() { + return SetupNeighborhoodSample(GNNPhase::kTrain); + } + size_t SetupNeighborhoodSample(GNNPhase seed_phase); //! Choose all edges from sampled nodes - void SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph); + size_t SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, + size_t timestamp); //! Sample neighbors of nodes that are marked as ready for sampling - void SampleEdges(size_t sample_layer_num, size_t num_to_sample, - bool inductive_subgraph); + size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, + bool inductive_subgraph, size_t timestamp); //! Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph(); + size_t ConstructSampledSubgraph(size_t num_sampled_layers); + + unsigned SampleNodeTimestamp(unsigned lid) const { + return sample_node_timestamps_[lid]; + } void EnableSubgraph() { use_subgraph_ = true; } void DisableSubgraph() { use_subgraph_ = false; } @@ -327,7 +334,7 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes - void PrepareNextTrainMinibatch(); + size_t PrepareNextTrainMinibatch(); //! Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; ////////////////////////////////////////////////////////////////////////////// @@ -595,6 +602,10 @@ class GNNGraph { //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; + // TODO use a char maybe? unlikely anyone will go over 2^8 layers... + //! What timestep a node was added to sampled set; used to determine + //! size of subgraph at each layer + galois::LargeArray sample_node_timestamps_; //! Indicates newly sampled nodes (for distributed synchronization of sampling //! status galois::DynamicBitSet new_sampled_nodes_; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 976303be84..29b4429e17 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -15,7 +15,7 @@ class GNNSubgraph { } //! Given sampled bits set on gnn_graph, builds an explicit subgraph //! for the sampled bits - size_t BuildSubgraph(GNNGraph& gnn_graph); + size_t BuildSubgraph(GNNGraph& gnn_graph, size_t num_sampled_layers); galois::gstl::Vector& GetLocalFeatures() { return subgraph_node_features_; @@ -99,7 +99,8 @@ class GNNSubgraph { private: //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. - void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph); + void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph, + size_t num_sampled_layers); //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); //! 
Creates edges diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 10e44511aa..124d3a80a7 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -102,6 +102,12 @@ class GNNLayer { // TODO(loc) output matrix should be resized if space becomes an issue, // else just use first S rows (S = subgraph size) } + virtual void ResizeInputOutputRows(size_t input_row, size_t output_row) { + layer_dimensions_.input_rows = input_row; + layer_dimensions_.output_rows = output_row; + // TODO(loc) output matrix should be resized if space becomes an issue, + // else just use first S rows (S = subgraph size) + } GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 2ed988bed0..9d45265afe 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -150,7 +150,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } // flip sampling on layers - if (config_.do_sampling()) { + if (config_.do_sampling() || config_.train_minibatch_size()) { for (std::unique_ptr& ptr : gnn_layers_) { ptr->EnableSampling(); } @@ -164,18 +164,30 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // this subgraph only needs to be created once if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph - graph_->SetupNeighborhoodSample(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_); + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for train subgraph for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + num_sampled_layers++; + // XXX resizing of layers } } + // resize layer matrices - train_subgraph_nodes = graph_->ConstructSampledSubgraph(); + // XXX resizing of layers should be done above, not here + train_subgraph_nodes = graph_->ConstructSampledSubgraph(num_sampled_layers); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(train_subgraph_nodes); @@ -191,6 +203,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); + // XXX resizing based on sampled per layer for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(train_subgraph_nodes); @@ -199,7 +212,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { - graph_->SetupNeighborhoodSample(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t 
num_sampled_layers = 0; // work backwards on GCN/SAGE layers @@ -209,14 +225,20 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_); + size_t current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); num_sampled_layers++; } } // resize layer matrices - size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + size_t num_subgraph_nodes = + graph_->ConstructSampledSubgraph(num_sampled_layers); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(num_subgraph_nodes); @@ -245,7 +267,15 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches - graph_->PrepareNextTrainMinibatch(); + size_t seed_node_count = graph_->PrepareNextTrainMinibatch(); + galois::gDebug(graph_->host_prefix(), + "Number of local seed nodes is for batch is ", + seed_node_count); + + // last layer input size/output rows becomes seed node size + gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + seed_node_count); + size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { @@ -254,33 +284,51 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { layer_type == GNNLayerType::kSAGE) { // you can minibatch with sampling or minibatch and grab all // relevant neighbors + size_t current_sample_size; if (config_.do_sampling()) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_); + current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); } else { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_); + current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); } + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + // resize this layer, change seed node count + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, seed_node_count); + seed_node_count = current_sample_size; num_sampled_layers++; } } + // resize layer matrices - size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(num_subgraph_nodes); - } + // size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + graph_->ConstructSampledSubgraph(num_sampled_layers); + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + + // for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + // layer++) { + // 
(*layer)->ResizeRows(num_subgraph_nodes); + //} const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Train accuracy/F1 micro is ", train_accuracy, "\n"); work_left_ += graph_->MoreTrainMinibatches(); char global_work_left = work_left_.reduce(); batch_timer.stop(); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Train accuracy/F1 micro is ", train_accuracy, + " time ", batch_timer.get(), "\n"); + if (!global_work_left) { break; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 5c159dc816..fff79ea4fe 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -86,6 +86,11 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // reverse edges partitioned_graph_->ConstructIncomingEdges(); + galois::gInfo(host_prefix_, "Number of local proxies is ", + partitioned_graph_->size()); + galois::gInfo(host_prefix_, "Number of local edges is ", + partitioned_graph_->sizeEdges()); + // read additional graph data ReadLocalLabels(dataset_name, has_single_class_label); ReadLocalFeatures(dataset_name); @@ -750,6 +755,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, bool is_inductive) { subgraph_ = std::make_unique(partitioned_graph_->size()); + sample_node_timestamps_.create(partitioned_graph_->size(), + std::numeric_limits::max()); edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); // this is to hold the *global* degree of a sampled graph; yes, memory wise // this is slightly problematic possibly, but each layer is its own @@ -764,19 +771,25 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } } -void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { +size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { use_subgraph_ = false; bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); // for now, if training node, it goes into seed node - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsValidForPhase(*x, seed_phase)) { - SetSampledNode(*x); - } else { - UnsetSampledNode(*x); - } - }); + galois::do_all(galois::iterate(begin_owned(), end_owned()), + [&](const NodeIterator& x) { + if (IsValidForPhase(*x, seed_phase)) { + SetSampledNode(*x); + bitset_sample_flag_.set(*x); + } else { + UnsetSampledNode(*x); + } + }); + + // clear node timestamps + std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), + std::numeric_limits::max()); // clear all sampled edges galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->sizeEdges()), [&](size_t edge_id) { @@ -794,10 +807,28 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } bitset_sampled_degrees_.resize(partitioned_graph_->size()); bitset_sampled_degrees_.reset(); + + // Write source = masters + sync_substrate_->sync( + "SampleSync"); + + galois::GAccumulator local_seed_count; + local_seed_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_seed_count += 1; + // 0 = seed node + sample_node_timestamps_[*x] = 0; + } + }); + + return local_seed_count.reduce(); } -void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, - bool inductive_subgraph) { 
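// Illustrative sketch, not part of this patch: with the signature changes below,
// the sampling routines return how many local nodes are in the sample after a
// layer's expansion and record the "timestamp" (layer index) at which each node
// first entered the sample. A caller that walks the layers backwards can then use
// those counts to shrink each layer's matrices. The names here are hypothetical,
// but the shape mirrors the minibatch loop added to GraphNeuralNetwork.cpp:
//
//   size_t rows_needed_above = graph.PrepareNextTrainMinibatch(); // seed nodes
//   layers.back()->ResizeInputOutputRows(rows_needed_above, rows_needed_above);
//   size_t depth = 0;
//   for (auto it = layers.rbegin(); it != layers.rend(); ++it) {
//     if (!IsAggregatingLayer(**it)) continue; // only GCN/SAGE layers sample
//     size_t expanded = graph.SampleAllEdges((*it)->graph_user_layer_number(),
//                                            /*inductive_subgraph=*/false,
//                                            depth + 1);
//     // this layer aggregates over `expanded` input rows but only needs to
//     // produce the rows required by the layer above it
//     (*it)->ResizeInputOutputRows(expanded, rows_needed_above);
//     rows_needed_above = expanded;
//     ++depth;
//   }
//   graph.ConstructSampledSubgraph(depth);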
+size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, + bool inductive_subgraph, + size_t timestamp) { use_subgraph_ = false; galois::GAccumulator sampled; @@ -846,11 +877,26 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, sync_substrate_ ->sync( "SampleSync"); + + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + return local_sample_count.reduce(); } -void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, - size_t num_to_sample, - bool inductive_subgraph) { +size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, + size_t num_to_sample, + bool inductive_subgraph, + size_t timestamp) { assert(!subgraph_is_train_); use_subgraph_ = false; @@ -928,10 +974,26 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, sync_substrate_ ->sync( "SampleSync"); + + // count sampled node size + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + return local_sample_count.reduce(); } //! Construct the subgraph from sampled edges and corresponding nodes -size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { +size_t +galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers) { // false first so that the build process can use functions to access the // real graph use_subgraph_ = false; @@ -940,13 +1002,14 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { sync_substrate_ ->sync( "SubgraphDegree"); - size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); + size_t num_subgraph_nodes = + subgraph_->BuildSubgraph(*this, num_sampled_layers); // after this, this graph is a subgraph use_subgraph_ = true; return num_subgraph_nodes; } -void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { +size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { train_batcher_->GetNextMinibatch(&local_minibatch_mask_); #ifndef NDEBUG size_t count = 0; @@ -960,7 +1023,7 @@ void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { // galois::gPrint("\n"); galois::gInfo(host_prefix(), "Batched nodes ", count); #endif - SetupNeighborhoodSample(GNNPhase::kBatch); + return SetupNeighborhoodSample(GNNPhase::kBatch); } //////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index a5f6d925ec..67d4b74fd0 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -1,11 +1,11 @@ #include "galois/graphs/GNNGraph.h" #include -size_t -galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { +size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( + GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); timer.start(); - CreateLocalToSubgraphMapping(gnn_graph); + CreateLocalToSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; } @@ -19,7 +19,7 @@ 
galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { } void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( - const GNNGraph& gnn_graph) { + const GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("LIDToSIDMapping", kRegionName); timer.start(); @@ -27,6 +27,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( // clear all mappings std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); + // TODO(loc) depending on overhead, can parallelize this with a prefix sum // serial loop over LIDs to construct lid -> subgraph id mapping uint32_t current_sid = 0; @@ -35,31 +36,44 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( size_t last_owned_node = *(gnn_graph.end_owned()); for (size_t local_node_id = 0; local_node_id < last_owned_node; local_node_id++) { - if (gnn_graph.IsInSampledGraph(local_node_id)) { + if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid - // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } - // all nodes before this SID are master nodes + // all nodes before this SID are master nodes *that matter* + // NOTE: there is a very subtle distinction here implementation wise + // that needs to be resolved in slightly more detail than this subgraph_master_boundary_ = current_sid; for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); local_node_id++) { - if (gnn_graph.IsInSampledGraph(local_node_id)) { + if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid - // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } - galois::gDebug("Number of sampled nodes for subgraph construction is ", - current_sid); + galois::gDebug( + "Number of sampled nodes for subgraph construction layer 0 is ", + current_sid); + + // XXX each sampled layer can be queried in parallel (think prefix sum); do + // this if this becomes a bottleneck + for (size_t i = 1; i < num_sampled_layers + 1; i++) { + for (size_t local_node_id = 0; local_node_id < gnn_graph.size(); + local_node_id++) { + if (gnn_graph.SampleNodeTimestamp(local_node_id) == i) { + lid_to_subgraph_id_[local_node_id] = current_sid++; + } + } + galois::gDebug("Number of sampled nodes for subgraph construction, layer ", + i, " is ", current_sid); + } num_subgraph_nodes_ = current_sid; - timer.stop(); } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 2ba3aa5ae3..1dabce8476 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -164,6 +164,7 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { + // XXX(loc) check this to make sure it works in subgraph setting size_t num_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 32aa863a0a..3f712df0f7 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -270,6 +270,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } if (!sage_config_.disable_concat) { + // 
XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { MaskInputNonMasters(&input_data); } else { @@ -291,7 +292,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // otherwise must mask other galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, + layer_dimensions_.output_rows, layer_dimensions_.output_columns, input_data.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); #ifdef GALOIS_ENABLE_GPU @@ -306,6 +307,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggdata can == p_intemp1; in other words, need to use before overwrite // mask it, then use it + // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 || sage_config_.disable_concat) { MaskInputNonMasters(&agg_data); } @@ -314,16 +316,17 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + // XXX output rows gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif - // temp 2 holds aggregated feature vectors from forward phase + // agg data holds aggregated feature vectors from forward phase galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, + layer_dimensions_.output_rows, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU @@ -349,6 +352,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { // --unmasked-- // disable concat part is here because otherwise it would get done elsewhere + // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 && sage_config_.disable_concat) { MaskInputNonMasters(&input_data); } else { @@ -367,6 +371,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif + // input col x input row * input row x output col galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, @@ -451,8 +456,16 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*, bool is_backward) { + // aggregation causes a row count change + size_t num_rows_to_handle; + if (!is_backward) { + num_rows_to_handle = layer_dimensions_.output_rows; + } else { + num_rows_to_handle = layer_dimensions_.input_rows; + } + galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(*(graph_.begin()), num_rows_to_handle), [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first @@ -469,10 +482,8 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - size_t dst = graph_.GetEdgeDest(e); - - if (layer_phase_ == GNNPhase::kTrain) { + if (layer_phase_ == GNNPhase::kTrain || + layer_phase_ == GNNPhase::kBatch) { // XXX if (IsSampledLayer()) { if 
(!graph_.IsEdgeSampled(e, layer_number_)) { @@ -480,7 +491,8 @@ void galois::SAGELayer::AggregateAllCPU( } } } - + size_t dst = graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { @@ -508,10 +520,8 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); e++) { - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - size_t dst = graph_.GetInEdgeDest(e); - - if (layer_phase_ == GNNPhase::kTrain) { + if (layer_phase_ == GNNPhase::kTrain || + layer_phase_ == GNNPhase::kBatch) { // XXX if (IsSampledLayer()) { if (!graph_.IsInEdgeSampled(e, layer_number_)) { @@ -519,6 +529,13 @@ void galois::SAGELayer::AggregateAllCPU( } } } + size_t dst = graph_.GetInEdgeDest(e); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); + + // input row x output row in backward means that i shouldn't be + // touching nodes past output rows; the above sample check + // should deal with this where this matters + assert(dst < layer_dimensions_.output_rows); size_t index_to_dst_feature = dst * column_length; @@ -553,6 +570,7 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, timer.start(); #ifdef GALOIS_ENABLE_GPU // TODO self change + // XXX(hochan) output rows if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.UpdateEmbeddingsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, @@ -561,14 +579,14 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, } else { #endif galois::gDebug("Layer ", graph_user_layer_number_, " ", - layer_dimensions_.input_rows, " ", + layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", layer_dimensions_.output_columns); // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_.data(), output); + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_.data(), output); #ifdef GALOIS_ENABLE_GPU } #endif @@ -587,10 +605,10 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( } else { #endif // note use of layer weights 2 differentiates this from above - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_2_.data(), output, true); + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU } #endif @@ -614,6 +632,7 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, #endif // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y + // note input rows is used here due to transpose of aggregation galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, @@ -641,7 +660,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( // difference is Trans for B matrix 
(data) to get z by y (weights is y by z // normally); result is x by y // true at end -> accumulate - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, layer_weights_2_.data(), output, true); From faf6b66f54e9e94ea27bffae6b73ef40776c317b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 11 May 2021 12:24:41 -0500 Subject: [PATCH 537/660] GNN resize layer affects output; reset all sampled 1) Resize layer changes the output rows. 2) (Un)set sampled nodes needs to apply to all nodes, not just owned nodes. Note that loop can be unconditional unset because seed nodes will always only be owned nodes. (mirror of a seed node on another host will get synchronized) --- libgnn/include/galois/layers/GNNLayer.h | 1 + libgnn/src/graphs/GNNGraph.cpp | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 124d3a80a7..4f5822d1b2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -99,6 +99,7 @@ class GNNLayer { virtual void ResizeRows(size_t new_row_count) { layer_dimensions_.input_rows = new_row_count; + layer_dimensions_.output_rows = new_row_count; // TODO(loc) output matrix should be resized if space becomes an issue, // else just use first S rows (S = subgraph size) } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fff79ea4fe..92dee12a28 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -628,9 +628,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - // GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, - // global_correct, - // global_checked); + galois::gDebug("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, + global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -786,6 +785,12 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { UnsetSampledNode(*x); } }); + // unsets nodes set in previous iterations; for some reason they get + // synchronized along with everything else even though bitset sample flag + // should prevent it (that, or it's because they don't get sync'd that they + // remain the same) + galois::do_all(galois::iterate(end_owned(), end()), + [&](const NodeIterator& x) { UnsetSampledNode(*x); }); // clear node timestamps std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), @@ -958,9 +963,9 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::steal(), galois::loopname("NeighborhoodSample")); // galois::gInfo(host_prefix(), "sampled nodes for layer ", sample_layer_num, - // " is ", total_nodes.reduce()); - galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", - sampled.reduce(), " out of ", total.reduce()); + // " is ", total_nodes.reduce()); + // galois::gInfo("Num sampled edges for layer ", sample_layer_num, " is ", + // sampled.reduce(), " out of ", total.reduce()); std::vector new_nodes = bitset_sample_flag_.getOffsets(); From 9772fb98be02cf5066cb221b7e679694ae1e336e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 11 May 2021 18:18:53 -0500 Subject: [PATCH 538/660] distgraphconvert builds again distgraphconvert 
was not updated to keep up with changes to send buffers. This commit fixes that. Made to build again so I could use the node reordering function of it for papers100M. --- .../dist-graph-convert-helpers.cpp | 23 +++++++++---------- .../dist-graph-convert-helpers.h | 22 ++++++++++-------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/tools/dist-graph-convert/dist-graph-convert-helpers.cpp b/tools/dist-graph-convert/dist-graph-convert-helpers.cpp index 4764598bbf..886103881d 100644 --- a/tools/dist-graph-convert/dist-graph-convert-helpers.cpp +++ b/tools/dist-graph-convert/dist-graph-convert-helpers.cpp @@ -269,7 +269,7 @@ void sendAndReceiveEdgeChunkCounts(std::vector& chunkCounts) { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, chunkCounts); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } // receive chunk counts @@ -279,10 +279,10 @@ void sendAndReceiveEdgeChunkCounts(std::vector& chunkCounts) { for (unsigned h = 0; h < totalNumHosts; h++) { if (h == hostID) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; do { - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + rBuffer = net.recieveTagged(galois::runtime::evilPhase); } while (!rBuffer); galois::runtime::gDeserialize(rBuffer->second, recvChunkCounts); @@ -416,12 +416,12 @@ uint64_t receiveEdgeCounts() { for (unsigned h = 0; h < totalNumHosts; h++) { if (h == hostID) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; uint64_t recvCount; do { - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + rBuffer = net.recieveTagged(galois::runtime::evilPhase); } while (!rBuffer); galois::runtime::gDeserialize(rBuffer->second, recvCount); @@ -450,9 +450,8 @@ void receiveAssignedEdges(std::atomic& edgesToReceive, std::vector recvDataVector; while (edgesToReceive) { - decltype( - net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; + rBuffer = net.recieveTagged(galois::runtime::evilPhase); // the buffer will have edge data as well if localsrctodata is // nonempty (it will be nonempty if initialized to non-empty by the @@ -460,7 +459,7 @@ void receiveAssignedEdges(std::atomic& edgesToReceive, // going to send edge data if (rBuffer) { auto& receiveBuffer = rBuffer->second; - while (receiveBuffer.r_size() > 0) { + while (receiveBuffer.size() > 0) { uint64_t src; if (localSrcToData.empty()) { // receive only dest data @@ -514,7 +513,7 @@ std::vector getEdgesPerHost(uint64_t localAssignedEdges) { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, localAssignedEdges); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } // receive @@ -524,10 +523,10 @@ std::vector getEdgesPerHost(uint64_t localAssignedEdges) { continue; } - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; uint64_t otherAssignedEdges; do { - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + rBuffer = net.recieveTagged(galois::runtime::evilPhase); } while (!rBuffer); galois::runtime::gDeserialize(rBuffer->second, 
otherAssignedEdges); diff --git a/tools/dist-graph-convert/dist-graph-convert-helpers.h b/tools/dist-graph-convert/dist-graph-convert-helpers.h index abf932056c..dc8d2a954a 100644 --- a/tools/dist-graph-convert/dist-graph-convert-helpers.h +++ b/tools/dist-graph-convert/dist-graph-convert-helpers.h @@ -838,7 +838,7 @@ void sendEdgeCounts(const std::vector& hostToNodes, continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, numEdgesPerHost[h].reduce()); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } }; @@ -929,8 +929,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, dstVector.clear(); if (hostSendBuffer.size() > 1400) { net.sendTagged(edgeOwner, galois::runtime::evilPhase, - hostSendBuffer); - hostSendBuffer.getVec().clear(); + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[edgeOwner] = + galois::runtime::SendBuffer(); } } @@ -966,8 +967,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, } if (hostSendBuffer.size() > 0) { - net.sendTagged(h, galois::runtime::evilPhase, hostSendBuffer); - hostSendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[h] = galois::runtime::SendBuffer(); } } }, @@ -1049,8 +1051,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, dataVector.clear(); if (hostSendBuffer.size() > 1400) { net.sendTagged(edgeOwner, galois::runtime::evilPhase, - hostSendBuffer); - hostSendBuffer.getVec().clear(); + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[edgeOwner] = + galois::runtime::SendBuffer(); } } @@ -1090,8 +1093,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, } if (hostSendBuffer.size() > 0) { - net.sendTagged(h, galois::runtime::evilPhase, hostSendBuffer); - hostSendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[h] = galois::runtime::SendBuffer(); } } }, From b5a79f4025a85af6427e0ee5ecb2630ba7dff1ab Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 11 May 2021 18:29:44 -0500 Subject: [PATCH 539/660] papers100M contiguous remapping related changes 1) Added code to gnngraph that remaps labels, features, and node ids to make train/val/test ranges contiguous. This is mostly useful for partitioning, so that I can partition training nodes evenly, which is key to minibatching. 2) CuSP hardcodes the training range for 100M-remap. 3) Hardcoded the 100M-remap mask ranges in GNNGraph to reduce its reading time. 4) Added a custom binary (non-text) label reader for 100M-remap; all labels should eventually switch to this format in any case. 5) Commented-out code in gnngraph-test that I used to test/do the remapping of 100M.
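For reference, the binary label layout that the new reader expects is simple: a small text file "<dataset>-labels-dims.txt" holding the global node count and the number of label classes, followed by a flat "<dataset>-labels.bin" holding one label per global node ID, in GID order. A minimal, illustrative writer for that layout is sketched below; the helper name and the one-byte label width are assumptions made for the sketch (the reader itself uses the library's GNNLabel type), not code from this patch.

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>

    // Writes "<prefix>-labels-dims.txt" ("<num_nodes> <num_classes>") and
    // "<prefix>-labels.bin" (labels[gid] for every global node ID, in order),
    // i.e. exactly what ReadLocalLabelsBin below reads back in.
    void WriteLabelsBin(const std::string& prefix, size_t num_classes,
                        const std::vector<uint8_t>& labels) {
      std::ofstream dims(prefix + "-labels-dims.txt");
      dims << labels.size() << " " << num_classes << "\n";

      std::ofstream bin(prefix + "-labels.bin", std::ios::binary | std::ios::out);
      bin.write(reinterpret_cast<const char*>(labels.data()),
                labels.size() * sizeof(uint8_t));
    }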
--- libcusp/include/galois/graphs/NewGeneric.h | 5 + libgnn/include/galois/graphs/GNNGraph.h | 5 + libgnn/src/graphs/GNNGraph.cpp | 247 +++++++++++++++++++-- libgnn/test/gnngraph-test.cpp | 11 + libgnn/test/gpu-convlayer-test.cpp | 4 +- 5 files changed, 257 insertions(+), 15 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 6ece9e2c51..6f13f42737 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -119,6 +119,11 @@ class NewDistGraphGeneric : public DistGraph { // this is entire graph: amazon's mask isn't contiguous bps.push_back(0); bps.push_back(86618); + } else if (filename.find("ogbn-papers100M-remap") != std::string::npos) { + galois::gInfo("papers remap being used"); + // whole graph (non contiguous mask) + bps.push_back(0); + bps.push_back(1207178); } else if (filename.find("ogbn-papers100M") != std::string::npos) { // whole graph (non contiguous mask) bps.push_back(0); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 5ff892057c..f970288718 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -517,6 +517,8 @@ class GNNGraph { } #endif + void ContiguousRemap(const std::string& new_name); + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -526,6 +528,7 @@ class GNNGraph { // Initialization ////////////////////////////////////////////////////////////////////////////// + void ReadLocalLabelsBin(const std::string& dataset_name); //! Read labels of local nodes only void ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label); @@ -655,6 +658,8 @@ class GNNGraph { std::unique_ptr train_batcher_; std::unique_ptr test_batcher_; + std::vector node_remapping_; + ////////////////////////////////////////////////////////////////////////////// // GPU things ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 92dee12a28..0c1d3b4d8f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -92,7 +92,12 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, partitioned_graph_->sizeEdges()); // read additional graph data - ReadLocalLabels(dataset_name, has_single_class_label); + if (dataset_name != "ogbn-papers100M-remap") { + ReadLocalLabels(dataset_name, has_single_class_label); + } else { + galois::gInfo("Remapped ogbn 100M"); + ReadLocalLabelsBin(dataset_name); + } ReadLocalFeatures(dataset_name); ReadLocalMasks(dataset_name); @@ -256,6 +261,49 @@ void galois::graphs::GNNGraph::AggregateSyncGPU( } } #endif +void galois::graphs::GNNGraph::ReadLocalLabelsBin( + const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + + std::ifstream file_stream; + file_stream.open(input_directory_ + dataset_name + "-labels-dims.txt", + std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } + file_stream.close(); + + std::string filename = input_directory_ + dataset_name + "-labels.bin"; + std::ifstream file_stream_bin; + file_stream_bin.open(filename, std::ios::binary | std::ios::in); + + std::vector 
all_labels(num_nodes); + // read all labels into a vector + file_stream_bin.read((char*)all_labels.data(), sizeof(GNNLabel) * num_nodes); + + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(partitioned_graph_->size()); + + galois::GAccumulator found_local_vertices; + found_local_vertices.reset(); + + // save only local ones; can do in parallel as well + // assumes -1 already dealt with + galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t lid) { + local_ground_truth_labels_[lid] = all_labels[GetGID(lid)]; + found_local_vertices += 1; + }); + + size_t fli = found_local_vertices.reduce(); + galois::gInfo(host_prefix_, "Read ", fli, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); + GALOIS_LOG_ASSERT(fli == partitioned_graph_->size()); +} void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { @@ -380,23 +428,25 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( node_feature_length_); // copy over features for local nodes only - size_t num_kept_vertices = 0; - for (size_t gid = 0; gid < num_global_vertices; gid++) { - if (partitioned_graph_->isLocal(gid)) { - // copy over feature vector - std::copy(full_feature_set.get() + gid * node_feature_length_, - full_feature_set.get() + (gid + 1) * node_feature_length_, - &local_node_features_[partitioned_graph_->getLID(gid) * - node_feature_length_]); - num_kept_vertices++; - } - } + galois::GAccumulator num_kept_vertices; + num_kept_vertices.reset(); + galois::do_all( + galois::iterate(size_t{0}, num_global_vertices), [&](size_t gid) { + if (partitioned_graph_->isLocal(gid)) { + // copy over feature vector + std::copy(full_feature_set.get() + gid * node_feature_length_, + full_feature_set.get() + (gid + 1) * node_feature_length_, + &local_node_features_[partitioned_graph_->getLID(gid) * + node_feature_length_]); + num_kept_vertices += 1; + } + }); full_feature_set.reset(); galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), " features (", local_node_features_.size() * double{4} / (1 << 30), " GB)"); - GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); + GALOIS_LOG_ASSERT(num_kept_vertices.reduce() == partitioned_graph_->size()); } //! 
Helper function to read masks from file into the appropriate structures @@ -516,6 +566,35 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { local_testing_mask_[partitioned_graph_->getLID(i)] = 1; } } + } else if (dataset_name == "ogbn-papers100M-remap") { + global_training_mask_range_ = {.begin = 0, .end = 1207178, .size = 1207178}; + global_validation_mask_range_ = { + .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; + global_testing_mask_range_ = { + .begin = 1332442, .end = 1332442 + 214337, .size = 214337}; + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + valid_other_ = FindOtherMask(); + GALOIS_LOG_ASSERT(valid_other_ == 109513177); } else { size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, @@ -1096,3 +1175,145 @@ void galois::graphs::GNNGraph::ResizeGPULayerVector(size_t num_layers) { resize_CUDA_layer_vector(cuda_ctx_, num_layers); } #endif +void galois::graphs::GNNGraph::ContiguousRemap(const std::string& new_name) { + node_remapping_.resize(partitioned_graph_->size()); + + uint32_t new_node_id = 0; + + // serial loops because new ID needs to be kept consistent + // first, train nodes + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTrain)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Train nodes are from 0 to ", new_node_id); + + // second, val nodes + uint32_t val_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kValidate)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Val nodes are from ", val_start, " to ", new_node_id, "(", + new_node_id - val_start, ")"); + + // third, test nodes + uint32_t test_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTest)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Test nodes are from ", test_start, " to ", new_node_id, "(", + new_node_id - test_start, ")"); + + // last, everything else + uint32_t other_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kOther)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Other nodes are from ", other_start, " to ", new_node_id, "(", + new_node_id - other_start, ")"); + GALOIS_LOG_ASSERT(new_node_id == partitioned_graph_->size()); + + // remap features to match new node mapping, save to disk + // std::vector remapped_features(local_node_features_.size()); + //// do all works because can copy in parallel + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // std::memcpy( + // 
&(remapped_features[remap_node_id * node_feature_length_]), + // &((local_node_features_.data())[node_remapping_[remap_node_id] * + // node_feature_length_]), node_feature_length_ * sizeof(GNNFeature)); + // } + //); + //// sanity check + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // for (size_t i = 0; i < node_feature_length_; i++) { + // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * + // node_feature_length_ + i] == + // local_node_features_[node_remapping_[remap_node_id] + // * node_feature_length_ + i]); + // } + // } + //); + //// save to disk + // std::ofstream write_file_stream; + // std::string feature_file = input_directory_ + new_name + "-feats.bin"; + // galois::gPrint(feature_file, "\n"); + // write_file_stream.open(feature_file, std::ios::binary | std::ios::out); + // write_file_stream.write((char*)remapped_features.data(), sizeof(GNNFeature) + // * + // partitioned_graph_->size() + // * node_feature_length_); + // write_file_stream.close(); + + // std::ifstream file_stream; + // file_stream.open(feature_file, std::ios::binary | std::ios::in); + // file_stream.read((char*)remapped_features.data(), sizeof(GNNFloat) * + // partitioned_graph_->size() + // * node_feature_length_); + // file_stream.close(); + //// sanity check again + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // for (size_t i = 0; i < node_feature_length_; i++) { + // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * + // node_feature_length_ + i] == + // local_node_features_[node_remapping_[remap_node_id] + // * node_feature_length_ + i]); + // } + // } + //); + // remapped_features.clear(); + + // std::vector remapped_labels(local_ground_truth_labels_.size()); + //// save new labels order to disk (binary file) + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // remapped_labels[remap_node_id] = + // local_ground_truth_labels_[node_remapping_[remap_node_id]]; + // } + //); + + // std::string label_filename = input_directory_ + new_name + "-labels.bin"; + // std::ofstream label_write_stream; + // label_write_stream.open(label_filename, std::ios::binary | std::ios::out); + // label_write_stream.write((char*)remapped_labels.data(), sizeof(GNNLabel) * + // partitioned_graph_->size()); + // label_write_stream.close(); + + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // remapped_labels[remap_node_id] = + // local_ground_truth_labels_[remap_node_id]; + // } + //); + // ReadLocalLabelsBin(new_name); + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // GALOIS_LOG_ASSERT(local_ground_truth_labels_[remap_node_id] == + // remapped_labels[node_remapping_[remap_node_id]]); + // } + //); + + // save the mapping to a binary file for use by graph convert to deal with + // the gr + std::string label_filename = input_directory_ + new_name + "-mapping.bin"; + std::ofstream label_write_stream; + label_write_stream.open(label_filename, std::ios::binary | std::ios::out); + label_write_stream.write((char*)node_remapping_.data(), + sizeof(uint32_t) * node_remapping_.size()); + label_write_stream.close(); +} diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 101540f4d5..6e12b13899 100644 --- a/libgnn/test/gnngraph-test.cpp +++ 
b/libgnn/test/gnngraph-test.cpp @@ -22,5 +22,16 @@ int main() { galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, true); + // below for when I want to check the remapper + // galois::graphs::GNNGraph remapper("ogbn-papers100M", + // galois::graphs::GNNPartitionScheme::kOEC, true); + // remapper.ContiguousRemap("ogbn-papers100M-remap"); + // galois::graphs::GNNGraph remapper("ogbn-papers100M-remap", + // galois::graphs::GNNPartitionScheme::kOEC, true); + + // galois::graphs::GNNGraph remapper("yelp", + // galois::graphs::GNNPartitionScheme::kOEC, true); + // remapper.ContiguousRemap("yelp-remap"); + return 0; } diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 553d96e1a2..3a822cf9c5 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -139,8 +139,8 @@ int main() { // since layer isn't 0 anymore, backward phase will actually return something dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - const galois::PointerWithSize& - layer_1_backward_output = layer_1->CopyBackwardOutputFromGPU(); + const galois::PointerWithSize& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected From fa608aecf7713741e5de142ad88ec6092a40394d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 14:10:20 -0500 Subject: [PATCH 540/660] GNN subgraph view implementation Adds another form of subgraph creation: views. Does not construct the explicit subgraph but only creates the LID-SID mappings. Issue is that this moves the overhead to the forward/backward aggregation where the program must now loop over a very large number of nodes + do mappings from LID to SID in the background; per minibatch this overhead adds up. --- libgnn/include/galois/graphs/GNNGraph.h | 121 ++++++++++++++---- libgnn/include/galois/graphs/GNNSubgraph.h | 7 +- libgnn/src/graphs/GNNGraph.cpp | 43 +++++-- libgnn/src/graphs/GNNSubgraph.cpp | 137 ++++++++++++--------- 4 files changed, 212 insertions(+), 96 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index f970288718..154a4027ff 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -34,12 +34,15 @@ enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; class GNNGraph { public: using GNNDistGraph = galois::graphs::DistGraph; - using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned using NodeIterator = boost::counting_iterator; using EdgeIterator = GNNDistGraph::edge_iterator; + // using GNNEdgeSortIterator = internal::EdgeSortIterator, + // galois::LargeArray>>; + GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); //! Loads a graph and all relevant metadata (labels, features, masks, etc.) @@ -68,7 +71,7 @@ class GNNGraph { size_t size() const { return partitioned_graph_->size(); } //! Returns # of nodes in the *graph that is currently active*. size_t active_size() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->size(); } else { return subgraph_->size(); @@ -81,7 +84,7 @@ class GNNGraph { //! 
Node begin for all local nodes NodeIterator begin() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->allNodesRange().begin(); } else { return subgraph_->begin(); @@ -89,7 +92,7 @@ class GNNGraph { } //! Node end for all local nodes NodeIterator end() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->allNodesRange().end(); } else { return subgraph_->end(); @@ -97,7 +100,7 @@ class GNNGraph { } NodeIterator begin_owned() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->masterNodesRange().begin(); } else { return subgraph_->begin_owned(); @@ -105,7 +108,7 @@ class GNNGraph { } NodeIterator end_owned() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->masterNodesRange().end(); } else { return subgraph_->end_owned(); @@ -126,32 +129,46 @@ class GNNGraph { // All following functions take a local node id EdgeIterator edge_begin(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edge_begin(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->edge_begin(ConvertToLID(n)); } else { return subgraph_->edge_begin(n); } }; EdgeIterator edge_end(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edge_end(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->edge_end(ConvertToLID(n)); } else { return subgraph_->edge_end(n); } }; GraphNode GetEdgeDest(EdgeIterator ei) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->getEdgeDst(ei); + } else if (use_subgraph_view_) { + // WARNING: this may return max of uint32 if the edge destination doesn't + // exist in the subgraph view + // get edge dest should NOT be called in that case + GraphNode rv = ConvertToSID(partitioned_graph_->getEdgeDst(ei)); + assert(rv != std::numeric_limits::max()); + return rv; } else { return subgraph_->GetEdgeDest(ei); } }; + galois::runtime::iterable< galois::NoDerefIterator> edges(GraphNode N) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edges(N); + } else if (use_subgraph_view_) { + return partitioned_graph_->edges(ConvertToLID(N)); } else { return subgraph_->edges(N); } @@ -166,14 +183,16 @@ class GNNGraph { } bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { if (!use_subgraph_) { + // view uses original graph edge iterators return edge_sample_status_[ei][layer_num]; } else { - GALOIS_LOG_FATAL("This shouldn't be called with subgraph"); + return subgraph_->OutEdgeSampled(ei, layer_num, *this); return false; } }; bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { + // view uses original graph edge iterators return edge_sample_status_[*ei][layer_num]; } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); @@ -193,19 +212,32 @@ class GNNGraph { edge_sample_status_[*ei][layer_num] = 0; }; + // GNNEdgeSortIterator EdgeSortBegin(GraphNode n) { + // return GNNEdgeSortIterator(*edge_begin(n), + // partitioned_graph_->edge_dst_ptr_LA(), &edge_sample_status_); + //} + // GNNEdgeSortIterator EdgeSortEnd(GraphNode n) { + // return GNNEdgeSortIterator(*edge_begin(n), + // partitioned_graph_->edge_dst_ptr_LA(), &edge_sample_status_); + //} + 
////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// EdgeIterator in_edge_begin(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edge_begin(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->in_edge_begin(ConvertToLID(n)); } else { return subgraph_->in_edge_begin(n); } } EdgeIterator in_edge_end(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edge_end(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->in_edge_end(ConvertToLID(n)); } else { return subgraph_->in_edge_end(n); } @@ -213,15 +245,22 @@ class GNNGraph { galois::runtime::iterable< galois::NoDerefIterator> in_edges(GraphNode N) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edges(N); + } else if (use_subgraph_view_) { + return partitioned_graph_->in_edges(ConvertToLID(N)); } else { return subgraph_->in_edges(N); } } GraphNode GetInEdgeDest(EdgeIterator ei) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { + return partitioned_graph_->GetInEdgeDest(ei); + } else if (use_subgraph_view_) { return partitioned_graph_->GetInEdgeDest(ei); + GraphNode rv = ConvertToSID(partitioned_graph_->GetInEdgeDest(ei)); + assert(rv != std::numeric_limits::max()); + return rv; } else { return subgraph_->GetInEdgeDest(ei); } @@ -241,6 +280,7 @@ class GNNGraph { }; bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { + // view can use this fine + requires it return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] [layer_num]; } else { @@ -274,20 +314,28 @@ class GNNGraph { size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, bool inductive_subgraph, size_t timestamp); + size_t ConstructSampledSubgraph(size_t num_sampled_layers) { + return ConstructSampledSubgraph(num_sampled_layers, false); + }; //! Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph(size_t num_sampled_layers); + size_t ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view); unsigned SampleNodeTimestamp(unsigned lid) const { return sample_node_timestamps_[lid]; } void EnableSubgraph() { use_subgraph_ = true; } - void DisableSubgraph() { use_subgraph_ = false; } + void EnableSubgraphView() { use_subgraph_view_ = true; } + void DisableSubgraph() { + use_subgraph_ = false; + use_subgraph_view_ = false; + } bool IsSubgraphOn() const { return use_subgraph_; } + bool IsSubgraphViewOn() const { return use_subgraph_view_; } //! Converts an id to an lid for the graph if subgraphs are in use uint32_t ConvertToLID(GraphNode sid) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return subgraph_->SIDToLID(sid); } else { return sid; @@ -295,7 +343,7 @@ class GNNGraph { } //! Converts an LID to an SID if subgraphs are in use uint32_t ConvertToSID(GraphNode lid) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return subgraph_->LIDToSID(lid); } else { return lid; @@ -303,7 +351,7 @@ class GNNGraph { } //! 
Converts SID to GID if subgraphs in use (else just return GID) uint32_t SIDToGID(GraphNode sid) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return GetGID(subgraph_->SIDToLID(sid)); } else { return GetGID(sid); @@ -312,13 +360,34 @@ class GNNGraph { //! Returns a pointer to the LID to SID map from the subgraph if subgraphs //! are in use galois::LargeArray* GetLIDToSIDPointer() { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return subgraph_->GetLIDToSIDPointer(); } else { return nullptr; } } + // void SortAllInEdgesBySID() { + // // check it out for node 0 + // //for (auto iter : in_edges(0)) { + // // galois::gInfo("0 to ", GetInEdgeDest(*iter), " with in out edge map ", + // *InEdgeToOutEdge(iter), " SID ", + // subgraph_->LIDToSID(GetInEdgeDest(*iter))); + // //} + // //galois::gInfo("Starting sort"); + // galois::StatTimer t("SortBySID"); + // t.start(); + // partitioned_graph_->SortAllInEdgesBySID(*(subgraph_->GetLIDToSIDPointer())); + // t.stop(); + // galois::gInfo("sort took ", t.get()); + // //galois::gInfo("End Sort"); + // //for (auto iter : in_edges(0)) { + // // galois::gInfo("0 to ", GetInEdgeDest(*iter), " with in out edge map ", + // *InEdgeToOutEdge(iter), " SID ", + // subgraph_->LIDToSID(GetInEdgeDest(*iter))); + // //} + //} + ////////////////////////////////////////////////////////////////////////////// void SetupTrainBatcher(size_t train_batch_size) { if (train_batcher_) { @@ -364,7 +433,7 @@ class GNNGraph { //! Get degree norm of subgraph for particular layer (i.e. includes training) GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { size_t degree; if (!subgraph_is_train_) { // case because degrees in each layer differ @@ -373,6 +442,7 @@ class GNNGraph { } else { degree = global_train_degrees_[subgraph_->SIDToLID(n)]; } + if (degree) { return 1.0 / degree; } else { @@ -394,7 +464,7 @@ class GNNGraph { GNNFloat GetSingleClassLabel(const unsigned lid) const { assert(using_single_class_labels_); unsigned to_use = lid; - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { to_use = subgraph_->SIDToLID(lid); } @@ -424,7 +494,7 @@ class GNNGraph { local_node_features_.size()); } #endif - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return PointerWithSize(local_node_features_); } else { return PointerWithSize(subgraph_->GetLocalFeatures().data(), @@ -440,7 +510,7 @@ class GNNGraph { // XXX maybe just map this all over to subgraph, though in that case // issue is that subgraph doesn't necessarily know about test/val unsigned to_use = lid; - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { to_use = subgraph_->SIDToLID(lid); } // re: phase checks in this if: ranges are not used for these @@ -653,6 +723,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; + bool use_subgraph_view_{false}; bool subgraph_is_train_{false}; std::unique_ptr train_batcher_; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 29b4429e17..0a7f2670c7 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -17,6 +17,8 @@ class GNNSubgraph { //! 
for the sampled bits size_t BuildSubgraph(GNNGraph& gnn_graph, size_t num_sampled_layers); + size_t BuildSubgraphView(GNNGraph& gnn_graph, size_t num_sampled_layers); + galois::gstl::Vector& GetLocalFeatures() { return subgraph_node_features_; } @@ -99,8 +101,9 @@ class GNNSubgraph { private: //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. - void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph, - size_t num_sampled_layers); + void CreateSubgraphMapping(const GNNGraph& gnn_graph, + size_t num_sampled_layers); + //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); //! Creates edges diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0c1d3b4d8f..0a8a29ad0e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -204,7 +204,7 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, bool is_backward) const { gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { sync_substrate_ @@ -594,7 +594,7 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } valid_other_ = FindOtherMask(); - GALOIS_LOG_ASSERT(valid_other_ == 109513177); + GALOIS_LOG_ASSERT(valid_other_ <= 109513177); } else { size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, @@ -707,8 +707,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - galois::gDebug("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, - global_checked); + GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, + global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -850,7 +850,9 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { - use_subgraph_ = false; + use_subgraph_ = false; + use_subgraph_view_ = false; + bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); @@ -913,7 +915,8 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, size_t timestamp) { - use_subgraph_ = false; + use_subgraph_ = false; + use_subgraph_view_ = false; galois::GAccumulator sampled; galois::GAccumulator total; @@ -982,7 +985,8 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, bool inductive_subgraph, size_t timestamp) { assert(!subgraph_is_train_); - use_subgraph_ = false; + use_subgraph_ = false; + use_subgraph_view_ = false; galois::GAccumulator sampled; galois::GAccumulator total; @@ -1077,19 +1081,36 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, //! 
Construct the subgraph from sampled edges and corresponding nodes size_t -galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers) { +galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, + bool use_view) { // false first so that the build process can use functions to access the // real graph use_subgraph_ = false; + use_subgraph_view_ = false; gnn_sampled_out_degrees_ = &sampled_out_degrees_; + // first, sync the degres of the sampled edges across all hosts sync_substrate_ ->sync( "SubgraphDegree"); - size_t num_subgraph_nodes = - subgraph_->BuildSubgraph(*this, num_sampled_layers); + size_t num_subgraph_nodes; + // use_view = true; + if (!use_view) { + num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); + } else { + // a view only has lid<->sid mappings + num_subgraph_nodes = + subgraph_->BuildSubgraphView(*this, num_sampled_layers); + //SortAllInEdgesBySID(); + } + // after this, this graph is a subgraph - use_subgraph_ = true; + if (!use_view) { + use_subgraph_ = true; + } else { + use_subgraph_view_ = true; + } + return num_subgraph_nodes; } diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 67d4b74fd0..f5bde956f2 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -5,7 +5,7 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); timer.start(); - CreateLocalToSubgraphMapping(gnn_graph, num_sampled_layers); + CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; } @@ -18,9 +18,19 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( return num_subgraph_nodes_; } -void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( +size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( + GNNGraph& gnn_graph, size_t num_sampled_layers) { + galois::StatTimer timer("BuildSubgraphView", kRegionName); + timer.start(); + CreateSubgraphMapping(gnn_graph, num_sampled_layers); + NodeFeatureCreation(gnn_graph); + timer.stop(); + return num_subgraph_nodes_; +} + +void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( const GNNGraph& gnn_graph, size_t num_sampled_layers) { - galois::StatTimer timer("LIDToSIDMapping", kRegionName); + galois::StatTimer timer("SIDMapping", kRegionName); timer.start(); assert(gnn_graph.size() == lid_to_subgraph_id_.size()); @@ -28,6 +38,17 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); + galois::GAccumulator subgraph_count; + subgraph_count.reset(); + galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsInSampledGraph(node_id)) { + subgraph_count += 1; + } + }); + num_subgraph_nodes_ = subgraph_count.reduce(); + subgraph_id_to_lid_.resize(num_subgraph_nodes_, 0); + // TODO(loc) depending on overhead, can parallelize this with a prefix sum // serial loop over LIDs to construct lid -> subgraph id mapping uint32_t current_sid = 0; @@ -39,13 +60,18 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + subgraph_id_to_lid_[current_sid] = local_node_id; 
lid_to_subgraph_id_[local_node_id] = current_sid++; } } // all nodes before this SID are master nodes *that matter* // NOTE: there is a very subtle distinction here implementation wise - // that needs to be resolved in slightly more detail than this + // that needs to be resolved in slightly more detail than this; + // there may be master nodes that are past this boundary that will + // not be covered by this begin_owned loop, which may cause problems down + // the line + // TODO(loc) see above subgraph_master_boundary_ = current_sid; for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); @@ -53,6 +79,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + subgraph_id_to_lid_[current_sid] = local_node_id; lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -66,6 +93,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( for (size_t local_node_id = 0; local_node_id < gnn_graph.size(); local_node_id++) { if (gnn_graph.SampleNodeTimestamp(local_node_id) == i) { + subgraph_id_to_lid_[current_sid] = local_node_id; lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -73,7 +101,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( i, " is ", current_sid); } - num_subgraph_nodes_ = current_sid; + GALOIS_LOG_ASSERT(num_subgraph_nodes_ == current_sid); + // num_subgraph_nodes_ = current_sid; timer.stop(); } @@ -83,35 +112,30 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( galois::StatTimer timer("DegreeCounting", kRegionName); timer.start(); - subgraph_id_to_lid_.resize(num_subgraph_nodes_); local_subgraph_out_degrees_.resize(num_subgraph_nodes_); local_subgraph_in_degrees_.resize(num_subgraph_nodes_); galois::do_all( - galois::iterate(gnn_graph.begin(), gnn_graph.end()), - [&](uint32_t node_id) { - if (gnn_graph.IsInSampledGraph(node_id)) { - uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; - subgraph_id_to_lid_[subgraph_id] = node_id; - - uint32_t out_degrees = 0; - for (auto out_edge_iter : gnn_graph.edges(node_id)) { - if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { - out_degrees++; - } + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + uint32_t out_degrees = 0; + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + out_degrees++; } - local_subgraph_out_degrees_[subgraph_id] = out_degrees; + } + local_subgraph_out_degrees_[subgraph_id] = out_degrees; - uint32_t in_degrees = 0; - for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { - if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { - in_degrees++; - } + uint32_t in_degrees = 0; + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_degrees++; } - local_subgraph_in_degrees_[subgraph_id] = in_degrees; - // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", - // out_degrees, " in ", in_degrees); } + local_subgraph_in_degrees_[subgraph_id] = in_degrees; + // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", + // out_degrees, " in ", in_degrees); }, galois::steal()); @@ -147,43 +171,40 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( // save edges + save reference to layer sample status galois::do_all( - 
galois::iterate(gnn_graph.begin(), gnn_graph.end()), - [&](uint32_t node_id) { - if (gnn_graph.IsInSampledGraph(node_id)) { - uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; - assert(subgraph_id != std::numeric_limits::max()); - uint32_t out_location = 0; - uint32_t in_location = 0; - if (subgraph_id != 0) { - out_location = local_subgraph_out_degrees_[subgraph_id - 1]; - in_location = local_subgraph_in_degrees_[subgraph_id - 1]; - } + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + assert(subgraph_id != std::numeric_limits::max()); + uint32_t out_location = 0; + uint32_t in_location = 0; + if (subgraph_id != 0) { + out_location = local_subgraph_out_degrees_[subgraph_id - 1]; + in_location = local_subgraph_in_degrees_[subgraph_id - 1]; + } + + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + assert(lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != + std::numeric_limits::max()); + subedge_to_original_edge_[out_location] = *out_edge_iter; - for (auto out_edge_iter : gnn_graph.edges(node_id)) { - if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { - assert( - lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != - std::numeric_limits::max()); - subedge_to_original_edge_[out_location] = *out_edge_iter; - - underlying_graph_.constructEdge( - out_location++, - lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); - } + underlying_graph_.constructEdge( + out_location++, + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); } + } - for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { - if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { - in_subedge_to_original_edge_[in_location] = - *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); - underlying_graph_.ConstructInEdge( - in_location++, - lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); - } + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_subedge_to_original_edge_[in_location] = + *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); + underlying_graph_.ConstructInEdge( + in_location++, + lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); } - assert(out_location == local_subgraph_out_degrees_[subgraph_id]); - assert(in_location == local_subgraph_in_degrees_[subgraph_id]); } + assert(out_location == local_subgraph_out_degrees_[subgraph_id]); + assert(in_location == local_subgraph_in_degrees_[subgraph_id]); }, galois::steal()); timer.stop(); From 27826c21232b4d27a13d9720fcd716aac81f9497 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 16:19:51 -0500 Subject: [PATCH 541/660] Training subgraph resizes rows in GNN; degree norm 1) resize gnn layer row counts to reduce linear xform cost 2) training subgraph uses global degrees now since it will take all edges and not just training nodes --- libgnn/include/galois/graphs/GNNGraph.h | 4 ++- libgnn/src/GraphNeuralNetwork.cpp | 38 +++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 154a4027ff..f78ab15bfc 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -440,7 +440,9 @@ class GNNGraph { degree = sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; } else { - degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + // XXX if inductive + // degree = 
global_train_degrees_[subgraph_->SIDToLID(n)]; + degree = global_degrees_[subgraph_->SIDToLID(n)]; } if (degree) { diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 9d45265afe..ce2a111af8 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -160,14 +160,16 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; - size_t train_subgraph_nodes = 0; + std::vector subgraph_layer_sizes; // this subgraph only needs to be created once if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + subgraph_layer_sizes.emplace_back(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; + gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -180,18 +182,15 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "Number of local nodes for train subgraph for layer ", (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); + // resizing + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, local_seed_node_count); + local_seed_node_count = current_sample_size; + subgraph_layer_sizes.emplace_back(local_seed_node_count); num_sampled_layers++; - // XXX resizing of layers } } - - // resize layer matrices - // XXX resizing of layers should be done above, not here - train_subgraph_nodes = graph_->ConstructSampledSubgraph(num_sampled_layers); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(train_subgraph_nodes); - } + graph_->ConstructSampledSubgraph(num_sampled_layers); } galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); @@ -203,10 +202,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); - // XXX resizing based on sampled per layer - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(train_subgraph_nodes); + size_t l_count = 0; + gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + (*back_iter) + ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], + subgraph_layer_sizes[l_count]); + l_count++; + } } } @@ -354,7 +361,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_validate || do_test) { // disable subgraph graph_->DisableSubgraph(); - // TODO only do this when necessary + // XXX test batching for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(graph_->size()); @@ -415,6 +422,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // check test accuracy + // XXX test batching galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); test_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); From 
9a016eb97568a444b8976da459bbff130c88f509 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 16:34:37 -0500 Subject: [PATCH 542/660] row change optimization for full batch sampling --- libgnn/src/GraphNeuralNetwork.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ce2a111af8..c1b51e757c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -220,9 +220,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); - size_t num_sampled_layers = 0; // work backwards on GCN/SAGE layers @@ -240,16 +240,16 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "Number of local nodes for layer ", (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); + + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, + local_seed_node_count); + local_seed_node_count = current_sample_size; num_sampled_layers++; } } // resize layer matrices - size_t num_subgraph_nodes = - graph_->ConstructSampledSubgraph(num_sampled_layers); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(num_subgraph_nodes); - } + graph_->ConstructSampledSubgraph(num_sampled_layers); } if (!config_.train_minibatch_size()) { @@ -315,16 +315,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices - // size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); graph_->ConstructSampledSubgraph(num_sampled_layers); // XXX resizes above only work for SAGE layers; will break if other // layers are tested - // for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - // layer++) { - // (*layer)->ResizeRows(num_subgraph_nodes); - //} - const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); @@ -336,6 +330,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { ": Train accuracy/F1 micro is ", train_accuracy, " time ", batch_timer.get(), "\n"); + // XXX mid batch test accuracy checking? + if (!global_work_left) { break; } From a39868cb25aceb14d43fb7d622e229a0a6b66df1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 19:06:03 -0500 Subject: [PATCH 543/660] Test batching Adds test batching capabilities to GNNs that allow test accuracy to be evaluated in chunks (reason is that maybe it's not possible to evaluate entire graph at once due to memory concerns). 
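A minimal sketch of the chunked evaluation this enables, under assumed, illustrative names (EvaluateChunk and ChunkedAccuracy below are not part of the libgnn API): each test minibatch contributes a (correct, checked) pair, and the reported accuracy is the ratio of the accumulated sums, so only one chunk's worth of predictions ever needs to be materialized.

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // One test chunk: count how many predictions in [begin, end) match labels.
    std::pair<std::size_t, std::size_t>
    EvaluateChunk(const std::vector<int>& preds, const std::vector<int>& labels,
                  std::size_t begin, std::size_t end) {
      std::size_t correct = 0;
      for (std::size_t i = begin; i < end; i++) {
        if (preds[i] == labels[i]) {
          correct++;
        }
      }
      return {correct, end - begin};
    }

    // Accuracy over the whole test set, one chunk at a time; only the running
    // (correct, checked) counters persist across chunks.
    float ChunkedAccuracy(const std::vector<int>& preds,
                          const std::vector<int>& labels,
                          std::size_t chunk_size) {
      if (chunk_size == 0) {
        return 0.0f;
      }
      std::size_t correct = 0;
      std::size_t checked = 0;
      for (std::size_t start = 0; start < preds.size(); start += chunk_size) {
        std::size_t stop = std::min(start + chunk_size, preds.size());
        auto part = EvaluateChunk(preds, labels, start, stop);
        correct += part.first;
        checked += part.second;
      }
      return checked ? static_cast<float>(correct) / checked : 0.0f;
    }

Accumulating counts rather than whole prediction matrices is what keeps peak memory bounded by a single chunk, which is the memory concern stated above.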
--- libgnn/include/galois/GraphNeuralNetwork.h | 4 + libgnn/include/galois/graphs/GNNGraph.h | 22 ++++ libgnn/src/GraphNeuralNetwork.cpp | 126 +++++++++++++++++++-- libgnn/src/graphs/GNNGraph.cpp | 54 +++++++-- lonestar/libgnnbench/src/Input.cpp | 6 + 5 files changed, 191 insertions(+), 21 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index fe1cb17477..712be6a8ec 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -93,6 +93,7 @@ class GraphNeuralNetworkConfig { bool do_sampling() const { return do_sampling_; } unsigned train_minibatch_size() const { return train_minibatch_size_; } + unsigned test_minibatch_size() const { return test_minibatch_size_; } //! Get the default layer config of layers in this GNN const GNNLayerConfig& default_layer_config() const { @@ -112,6 +113,7 @@ class GraphNeuralNetworkConfig { //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; unsigned train_minibatch_size_{0}; + unsigned test_minibatch_size_{0}; //! Fan out used for sampling (if sampling is enabled) std::vector fan_out_vector_; @@ -173,6 +175,8 @@ class GraphNeuralNetwork { //! Returns the output layer galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + float MinibatchedTesting(); + //! Do training for a specified # of epochs and return test accuracy at the //! end of it float Train(size_t num_epochs); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index f78ab15bfc..c3bc396551 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -406,6 +406,25 @@ class GNNGraph { size_t PrepareNextTrainMinibatch(); //! Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; + + ////////////////////////////////////////////////////////////////////////////// + + void SetupTestBatcher(size_t test_batch_size) { + if (test_batcher_) { + // clear before remake + test_batcher_.reset(); + } + test_batcher_ = std::make_unique( + local_testing_mask_, test_batch_size, *end_owned()); + local_minibatch_mask_.resize(partitioned_graph_->size()); + } + void ResetTestMinibatcher() { test_batcher_->ResetMinibatchState(); } + //! Setup the state for the next minibatch sampling call by using the + //! minibatcher to pick up the next set batch of nodes + size_t PrepareNextTestMinibatch(); + //! Returns true if there are still more minibatches in this graph + bool MoreTestMinibatches() { return !test_batcher_->NoMoreMinibatches(); }; + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetGCNNormFactor(GraphNode lid) const { if (global_degrees_[lid]) { @@ -461,6 +480,9 @@ class GNNGraph { float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase, bool sampling); + std::pair + GetBatchAccuracy(PointerWithSize predictions); + //! Returns the ground truth label of some local id assuming labels are single //! class labels. 
GNNFloat GetSingleClassLabel(const unsigned lid) const { diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index c1b51e757c..ab1d89e066 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -103,9 +103,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } } - // XXX test minibatch if (config_.do_sampling() || config_.use_train_subgraph_ || - config.train_minibatch_size()) { + config.train_minibatch_size() || config.test_minibatch_size()) { // output layer not included; it will never involve sampling graph_->InitializeSamplingData(num_graph_user_layers_, config_.use_train_subgraph_); @@ -114,7 +113,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( if (config_.train_minibatch_size()) { graph_->SetupTrainBatcher(config_.train_minibatch_size()); } - // XXX test minibatch size + if (config_.test_minibatch_size()) { + graph_->SetupTestBatcher(config_.test_minibatch_size()); + } // create the output layer GNNLayerDimensions output_dims = { @@ -150,13 +151,70 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } // flip sampling on layers - if (config_.do_sampling() || config_.train_minibatch_size()) { + if (config_.use_train_subgraph_ || config_.do_sampling() || + config_.train_minibatch_size()) { for (std::unique_ptr& ptr : gnn_layers_) { ptr->EnableSampling(); } } } +float galois::GraphNeuralNetwork::MinibatchedTesting() { + galois::gDebug("minibatched testing"); + graph_->ResetTestMinibatcher(); + SetLayerPhases(galois::GNNPhase::kBatch); + + uint32_t correct = 0; + uint32_t total = 0; + while (true) { + work_left_.reset(); + size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + // last layer input size/output rows becomes seed node size + gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); + size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + size_t current_sample_size; + current_sample_size = + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), + false, num_sampled_layers + 1); + // resize this layer, change seed node count + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, seed_node_count); + seed_node_count = current_sample_size; + num_sampled_layers++; + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + } + } + + // resize layer matrices + graph_->ConstructSampledSubgraph(num_sampled_layers); + + const PointerWithSize batch_pred = DoInference(); + std::pair correct_total = + graph_->GetBatchAccuracy(batch_pred); + + correct += correct_total.first; + total += correct_total.second; + + work_left_ += graph_->MoreTestMinibatches(); + char global_work_left = work_left_.reduce(); + if (!global_work_left) { + break; + } + } + + galois::gDebug("correct / total ", correct, " ", total); + + return (1.0 * correct) / (1.0 * total); +} + float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; @@ -357,7 +415,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_validate || do_test) { // disable subgraph graph_->DisableSubgraph(); - // XXX test batching for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { 
(*layer)->ResizeRows(graph_->size()); @@ -383,11 +440,19 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_test) { epoch_test_timer.start(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize test_pred = DoInference(); - epoch_test_timer.stop(); + float test_acc; + + if (!config_.test_minibatch_size()) { + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + epoch_test_timer.stop(); + + test_acc = GetGlobalAccuracy(test_pred); + } else { + test_acc = MinibatchedTesting(); + epoch_test_timer.stop(); + } - float test_acc = GetGlobalAccuracy(test_pred); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); const std::string test_name_acc = @@ -404,6 +469,35 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); + + // TODO too much code dupe + // Resconstruct the train subgraph since it was replaced by test subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size() && + config_.test_minibatch_size() && do_test) { + // Setup the subgraph to only be the training graph + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + gnn_layers_.back()->ResizeRows(local_seed_node_count); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + // resizing + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, + local_seed_node_count); + local_seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + graph_->ConstructSampledSubgraph(num_sampled_layers); + } } } @@ -420,10 +514,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // check test accuracy // XXX test batching galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); + float global_accuracy; + test_timer.start(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize predictions = DoInference(); - float global_accuracy = GetGlobalAccuracy(predictions); + + if (!config_.test_minibatch_size()) { + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize predictions = DoInference(); + global_accuracy = GetGlobalAccuracy(predictions); + } else { + global_accuracy = MinibatchedTesting(); + } + test_timer.stop(); if (this_host == 0) { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0a8a29ad0e..fca2c78cfd 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -713,6 +713,36 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( return static_cast(global_correct) / static_cast(global_checked); } +std::pair galois::graphs::GNNGraph::GetBatchAccuracy( + PointerWithSize predictions) { + // check owned nodes' accuracy + assert((num_label_classes_ * size()) == predictions.size()); + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + // will only loop over sampled nodes if sampling is on + galois::iterate(begin_owned(), end_owned()), + // this is possibly the subgraph id + 
[&](const unsigned node_id) { + if (IsValidForPhase(node_id, GNNPhase::kBatch)) { + total_checked_ += 1; + size_t predicted_label = galois::MaxIndex( + num_label_classes_, &(predictions[node_id * num_label_classes_])); + if (predicted_label == + static_cast(GetSingleClassLabel(node_id))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + return std::make_pair(global_correct, global_checked); +} float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( PointerWithSize predictions, GNNPhase phase, bool sampling) { @@ -918,10 +948,10 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, use_subgraph_ = false; use_subgraph_view_ = false; - galois::GAccumulator sampled; - galois::GAccumulator total; - sampled.reset(); - total.reset(); + // galois::GAccumulator sampled; + // galois::GAccumulator total; + // sampled.reset(); + // total.reset(); galois::do_all( galois::iterate(begin(), end()), @@ -930,7 +960,7 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(src_iter)) { // marks ALL edges of nodes that connect to train/other nodes for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - total += 1; + // total += 1; if (inductive_subgraph) { if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), GNNPhase::kTrain) && @@ -945,14 +975,15 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } - sampled += 1; + // sampled += 1; } } }, galois::steal(), galois::loopname("ChooseAllEdges")); - galois::gPrint("Num sampled edges in inductive graph is ", sampled.reduce(), - " out of ", total.reduce(), "\n"); + // galois::gPrint("Num sampled edges in inductive graph is ", + // sampled.reduce(), + // " out of ", total.reduce(), "\n"); std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can @@ -1101,7 +1132,7 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, // a view only has lid<->sid mappings num_subgraph_nodes = subgraph_->BuildSubgraphView(*this, num_sampled_layers); - //SortAllInEdgesBySID(); + // SortAllInEdgesBySID(); } // after this, this graph is a subgraph @@ -1131,6 +1162,11 @@ size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { return SetupNeighborhoodSample(GNNPhase::kBatch); } +size_t galois::graphs::GNNGraph::PrepareNextTestMinibatch() { + test_batcher_->GetNextMinibatch(&local_minibatch_mask_); + return SetupNeighborhoodSample(GNNPhase::kBatch); +} + //////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d15adf2d9f..4b7717eac3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -132,6 +132,11 @@ llvm::cl::opt cll::desc("Size of training minibatch (default 0)"), cll::init(0)); +llvm::cl::opt + test_minibatch_size("testMinibatchSize", + cll::desc("Size of test minibatch (default 0)"), + cll::init(0)); + llvm::cl::opt val_interval("valInterval", cll::desc("# of epochs to test validation set (default 0)"), @@ -306,6 +311,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { gnn_config.validation_interval_ = val_interval; 
gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.test_minibatch_size_ = test_minibatch_size; gnn_config.inductive_subgraph_ = inductive_subgraph; gnn_config.fan_out_vector_ = CreateFanOutVector(); From 8ff7cfc7e3a749190092944dd6b6dad9b3903a65 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 14 May 2021 22:38:22 -0500 Subject: [PATCH 544/660] Minibatch test interval For minibatching, allows testing at a particular step of minibatches so that you can evaluate accuracy in the middle of a minibatch epoch. --- libgnn/include/galois/GraphNeuralNetwork.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 42 +++++++++++++++++++++- lonestar/libgnnbench/src/Input.cpp | 20 +++++++---- 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 712be6a8ec..af1955d258 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -112,6 +112,7 @@ class GraphNeuralNetworkConfig { unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; + unsigned minibatch_test_interval_{10}; unsigned train_minibatch_size_{0}; unsigned test_minibatch_size_{0}; //! Fan out used for sampling (if sampling is enabled) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ab1d89e066..fbaad4ff4d 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -384,11 +384,51 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { work_left_ += graph_->MoreTrainMinibatches(); char global_work_left = work_left_.reduce(); batch_timer.stop(); + epoch_timer.stop(); galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, ": Train accuracy/F1 micro is ", train_accuracy, " time ", batch_timer.get(), "\n"); - // XXX mid batch test accuracy checking? + bool test_eval = + config_.minibatch_test_interval_ + ? 
(batch_num - 1) % config_.minibatch_test_interval_ == 0 + : false; + + if (test_eval) { + float test_acc; + if (!config_.test_minibatch_size()) { + graph_->DisableSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(graph_->size()); + } + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + test_acc = GetGlobalAccuracy(test_pred); + } else { + test_acc = MinibatchedTesting(); + } + + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Test accuracy is ", test_acc, "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1) + "Accuracy"; + galois::runtime::reportStat_Single("GraphNeuralNetwork", + test_name_acc, test_acc); + } + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + "GraphNeuralNetwork", + "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1), + epoch_timer.get()); + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + } + + epoch_timer.start(); if (!global_work_left) { break; diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 4b7717eac3..5facfa95c5 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -137,6 +137,11 @@ llvm::cl::opt cll::desc("Size of test minibatch (default 0)"), cll::init(0)); +llvm::cl::opt minibatch_test_interval( + "minibatchTestInterval", + cll::desc("Size of test intervals for minibatch (default 0)"), + cll::init(0)); + llvm::cl::opt val_interval("valInterval", cll::desc("# of epochs to test validation set (default 0)"), @@ -307,13 +312,14 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.use_train_subgraph_ = use_train_subgraph; - gnn_config.validation_interval_ = val_interval; - gnn_config.test_interval_ = test_interval; - gnn_config.train_minibatch_size_ = train_minibatch_size; - gnn_config.test_minibatch_size_ = test_minibatch_size; - gnn_config.inductive_subgraph_ = inductive_subgraph; - gnn_config.fan_out_vector_ = CreateFanOutVector(); + gnn_config.use_train_subgraph_ = use_train_subgraph; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; + gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.test_minibatch_size_ = test_minibatch_size; + gnn_config.minibatch_test_interval_ = minibatch_test_interval; + gnn_config.inductive_subgraph_ = inductive_subgraph; + gnn_config.fan_out_vector_ = CreateFanOutVector(); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From 6ab5a9e2298cb8bdde101d898d6edc751e6e2bf7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 17 May 2021 22:53:37 -0500 Subject: [PATCH 545/660] MKL microbenchmark 3 variants: no galois, init galois shared, and init galois dist --- libgnn/test/CMakeLists.txt | 17 +++++++ libgnn/test/mkl_micro.cpp | 98 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 libgnn/test/mkl_micro.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 11c7ab78b8..3a7fec8729 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -1,3 +1,20 @@ +add_executable(mkl_micro mkl_micro.cpp) 
+target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) +target_include_directories(mkl_micro PUBLIC + ${MKL_INCLUDE_DIRS} +) +target_link_libraries(mkl_micro ${INTEL_LIBS}) + +add_executable(mkl_micro_sgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_sgalois galois_gnn) +target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + +add_executable(mkl_micro_dgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_dgalois galois_gnn) +target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +################################################################################ + add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp new file mode 100644 index 0000000000..e784b2cde0 --- /dev/null +++ b/libgnn/test/mkl_micro.cpp @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#ifdef USE_SHARED_GALOIS +#include "galois/Galois.h" +#endif +#ifdef USE_DIST_GALOIS +#include "galois/DistGalois.h" +#endif + +// MKL wrapper +void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? output_columns : input_columns; + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, output, output_columns); +} + +void CacheFlush(std::vector* matrix) { + for (size_t i = 0; i < matrix->size(); i++) { + (*matrix)[i] = i; + } +} + +int main(int argc, char* argv[]) { +#ifdef USE_SHARED_GALOIS + galois::SharedMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); +#endif + +#ifdef USE_DIST_GALOIS + galois::DistMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Dist Mem with %u threads\n", + galois::getActiveThreads()); +#endif + + printf("%d %s\n", argc, argv[0]); + + // dimensions from test case + size_t a_dim = 12000000; + size_t b_dim = 128; + size_t c_dim = 16; + + // inputs + std::vector matrix_1(a_dim * b_dim); + std::vector matrix_2(a_dim * c_dim); + // output + std::vector matrix_3(b_dim * c_dim); + + size_t kBigSize = 1000000000; + std::vector very_big_matrix(kBigSize); + + // change reps here; maybe make it command line arg + for (size_t reps = 0; reps < 3; reps++) { + // reinit + srand(0); + for (size_t i = 0; i < matrix_1.size(); i++) { + matrix_1[i] = rand() / static_cast(RAND_MAX / 10); + } + srand(1); + for (size_t i = 0; i < matrix_2.size(); i++) { + matrix_2[i] = rand() / static_cast(RAND_MAX / 10); + } + + very_big_matrix.clear(); + very_big_matrix.resize(kBigSize); + // cache flush + CacheFlush(&very_big_matrix); + + printf("Rep %lu\n", reps); + + // transpose because it's the same as the problematic call in GNN + // TODO(loc) non transpose version + CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), + matrix_2.data(), matrix_3.data()); + } + + return 0; +} From 
24551108c8fb2928dda4cab2b95f41fc5c8885b4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 18 May 2021 16:42:02 -0500 Subject: [PATCH 546/660] GNN subgraph PODResizableArray rather than vector --- libgnn/include/galois/graphs/GNNGraph.h | 2 +- libgnn/include/galois/graphs/GNNSubgraph.h | 14 +++++------ libgnn/src/graphs/GNNSubgraph.cpp | 28 +++++++++++++++++----- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c3bc396551..f3689b00be 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -522,7 +522,7 @@ class GNNGraph { return PointerWithSize(local_node_features_); } else { return PointerWithSize(subgraph_->GetLocalFeatures().data(), - subgraph_->GetLocalFeatures().size()); + subgraph_->size() * node_feature_length_); } } diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 0a7f2670c7..6be5fb04fc 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -19,7 +19,7 @@ class GNNSubgraph { size_t BuildSubgraphView(GNNGraph& gnn_graph, size_t num_sampled_layers); - galois::gstl::Vector& GetLocalFeatures() { + galois::PODResizeableArray& GetLocalFeatures() { return subgraph_node_features_; } @@ -124,20 +124,20 @@ class GNNSubgraph { //! Features corresponding only to this subgraph; copied from main graph //! (in other words, redundant; would be nice if there was a way to //! fake contiguous memory - galois::gstl::Vector subgraph_node_features_; + galois::PODResizeableArray subgraph_node_features_; //! Dense array mapping local ids to subgraph id (not space efficient) galois::LargeArray lid_to_subgraph_id_; //! Map subgraph ids back to local graph ids //! gstl vector because this will get resized every epoch (LargeArray //! is for static) - galois::gstl::Vector subgraph_id_to_lid_; + galois::PODResizeableArray subgraph_id_to_lid_; // intermediate degrees used for edge construction - galois::gstl::Vector local_subgraph_out_degrees_; - galois::gstl::Vector local_subgraph_in_degrees_; + galois::PODResizeableArray local_subgraph_out_degrees_; + galois::PODResizeableArray local_subgraph_in_degrees_; //! Maps from subgraph out-edge id to original graph edge id (used to check if //! edge exists in particular layer) - galois::gstl::Vector subedge_to_original_edge_; + galois::PODResizeableArray subedge_to_original_edge_; //! Maps from subgraph in-edge id to original graph edge id (used to check if //! 
edge exists in particular layer) - galois::gstl::Vector in_subedge_to_original_edge_; + galois::PODResizeableArray in_subedge_to_original_edge_; }; diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index f5bde956f2..332cd98072 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -47,7 +47,9 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( } }); num_subgraph_nodes_ = subgraph_count.reduce(); - subgraph_id_to_lid_.resize(num_subgraph_nodes_, 0); + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { + subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); + } // TODO(loc) depending on overhead, can parallelize this with a prefix sum // serial loop over LIDs to construct lid -> subgraph id mapping @@ -112,8 +114,13 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( galois::StatTimer timer("DegreeCounting", kRegionName); timer.start(); - local_subgraph_out_degrees_.resize(num_subgraph_nodes_); - local_subgraph_in_degrees_.resize(num_subgraph_nodes_); + if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02); + } + + if (local_subgraph_in_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_in_degrees_.resize(num_subgraph_nodes_ * 1.02); + } galois::do_all( galois::iterate(begin(), end()), @@ -155,10 +162,15 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( } // allocate then set node endpoints - num_subgraph_edges_ = local_subgraph_out_degrees_.back(); + num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; + + galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName); + alloc_time.start(); underlying_graph_.DeallocateOnly(); underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); + alloc_time.stop(); + galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { underlying_graph_.fixEndEdge( @@ -166,8 +178,12 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.FixEndInEdge( subgraph_id, local_subgraph_in_degrees_[subgraph_id]); }); - subedge_to_original_edge_.resize(num_subgraph_edges_); - in_subedge_to_original_edge_.resize(num_subgraph_edges_); + if (subedge_to_original_edge_.size() < num_subgraph_edges_) { + subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } + if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { + in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } // save edges + save reference to layer sample status galois::do_all( From 154fd9c358a679d5db8452b653a7a06100bb02db Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 18 May 2021 17:10:32 -0500 Subject: [PATCH 547/660] MKL micro delete galois and OMP loop --- libgnn/test/CMakeLists.txt | 12 ++++++++++++ libgnn/test/mkl_micro.cpp | 30 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 3a7fec8729..b9c1eea043 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -5,6 +5,14 @@ target_include_directories(mkl_micro PUBLIC ) target_link_libraries(mkl_micro ${INTEL_LIBS}) +add_executable(mkl_micro_omp mkl_micro.cpp) +target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) +target_include_directories(mkl_micro_omp PUBLIC + ${MKL_INCLUDE_DIRS} +) +target_link_libraries(mkl_micro_omp ${INTEL_LIBS}) +target_link_libraries(mkl_micro_omp -fopenmp) + 
add_executable(mkl_micro_sgalois mkl_micro.cpp) target_link_libraries(mkl_micro_sgalois galois_gnn) target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) @@ -13,6 +21,10 @@ add_executable(mkl_micro_dgalois mkl_micro.cpp) target_link_libraries(mkl_micro_dgalois galois_gnn) target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) +add_executable(mkl_micro_delete_galois mkl_micro.cpp) +target_link_libraries(mkl_micro_delete_galois galois_gnn) +target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) + ################################################################################ add_executable(gnngraph-test gnngraph-test.cpp) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index e784b2cde0..63a3f3f33b 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -9,6 +9,13 @@ #ifdef USE_DIST_GALOIS #include "galois/DistGalois.h" #endif +#ifdef USE_SHARED_GALOIS_DELETE +#include "galois/Galois.h" +#endif + +#ifdef USE_OMP +#include "omp.h" +#endif // MKL wrapper void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, @@ -42,6 +49,21 @@ int main(int argc, char* argv[]) { galois::getActiveThreads()); #endif +#ifdef USE_SHARED_GALOIS_DELETE + std::unique_ptr G; + G = std::make_unique(); + + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); + printf("Deleting galois\n"); + G.reset(); +#endif + #ifdef USE_DIST_GALOIS galois::DistMemSys G; if (argc != 2) { @@ -86,6 +108,14 @@ int main(int argc, char* argv[]) { // cache flush CacheFlush(&very_big_matrix); + // dummy OMP TBB loop +#ifdef USE_OMP +#pragma omp parallel + for (size_t i = 0; i < very_big_matrix.size(); i++) { + very_big_matrix[i] = i; + } +#endif + printf("Rep %lu\n", reps); // transpose because it's the same as the problematic call in GNN From ee42d7e9d9bedcc0ec8b7b892f36678e86bebfb3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 18 May 2021 22:42:05 -0500 Subject: [PATCH 548/660] mkl micro: use Large arrays when galois active --- libgnn/test/mkl_micro.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index 63a3f3f33b..ea9511df74 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -5,9 +5,11 @@ #ifdef USE_SHARED_GALOIS #include "galois/Galois.h" +#include "galois/LargeArray.h" #endif #ifdef USE_DIST_GALOIS #include "galois/DistGalois.h" +#include "galois/LargeArray.h" #endif #ifdef USE_SHARED_GALOIS_DELETE #include "galois/Galois.h" @@ -82,11 +84,23 @@ int main(int argc, char* argv[]) { size_t b_dim = 128; size_t c_dim = 16; +#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) + printf("Using Galois large arrays\n"); + // inputs + galois::LargeArray matrix_1; + matrix_1.create(a_dim * b_dim); + galois::LargeArray matrix_2; + matrix_2.create(a_dim * c_dim); + // output + galois::LargeArray matrix_3; + matrix_3.create(b_dim * c_dim); +#else // inputs std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output std::vector matrix_3(b_dim * c_dim); +#endif size_t kBigSize = 1000000000; std::vector very_big_matrix(kBigSize); From 27feed3824eea88c792cf22e2a45c6702f91c821 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 19 May 2021 18:17:38 -0500 Subject: [PATCH 549/660] Timer disabling option in GNNs 1) Adds method to disable timers for 
GNNs; this is mostly going to be used to not time anything for Test phase since I don't want test time to get included in existing timers (all I care for is training time breakdown). 2) Small change to how newly sampled nodes are set in sampling: rather than use getOffsets on bitset, just loop over it myself and call test individually on each bit; saves materialization of vector. --- libgnn/include/galois/GraphNeuralNetwork.h | 16 +++ libgnn/include/galois/graphs/GNNGraph.h | 15 +++ libgnn/include/galois/graphs/GNNSubgraph.h | 12 +++ libgnn/include/galois/layers/GNNLayer.h | 14 ++- libgnn/include/galois/layers/SAGELayer.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 47 ++++++--- libgnn/src/graphs/GNNGraph.cpp | 111 +++++++++++++++------ libgnn/src/graphs/GNNSubgraph.cpp | 28 +++--- libgnn/src/layers/GNNLayer.cpp | 48 ++++----- libgnn/src/layers/SAGELayer.cpp | 52 ++++++---- libgnn/src/layers/SoftmaxLayer.cpp | 9 ++ 11 files changed, 248 insertions(+), 105 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index af1955d258..fc200e7baa 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -200,6 +200,22 @@ class GraphNeuralNetwork { void GradientPropagation(); private: + static const constexpr char* kRegionName = "GraphNeuralNetwork"; + + void EnableTimers() { + galois::gDebug("Enabling timers"); + graph_->EnableTimers(); + for (auto& layer : gnn_layers_) + layer->EnableTimers(); + } + + void DisableTimers() { + galois::gDebug("Disabling timers"); + graph_->DisableTimers(); + for (auto& layer : gnn_layers_) + layer->DisableTimers(); + } + //! Underlying graph to train std::unique_ptr graph_; //! Optimizer object for weight updates diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index f3689b00be..6e2b211e00 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -613,6 +613,19 @@ class GNNGraph { void ContiguousRemap(const std::string& new_name); + void EnableTimers() { + use_timer_ = true; + if (subgraph_) { + subgraph_->EnableTimers(); + } + } + void DisableTimers() { + use_timer_ = false; + if (subgraph_) { + subgraph_->DisableTimers(); + } + } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -776,6 +789,8 @@ class GNNGraph { DGAccumulator local_true_negative_; DGAccumulator local_false_positive_; DGAccumulator local_false_negative_; + + bool use_timer_{true}; }; } // namespace graphs diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 6be5fb04fc..81825e2ed1 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -97,8 +97,20 @@ class GNNSubgraph { galois::LargeArray* GetLIDToSIDPointer() { return &lid_to_subgraph_id_; } + void EnableTimers() { use_timer_ = true; } + void DisableTimers() { use_timer_ = false; } private: + bool use_timer_{true}; + void TimerStart(galois::StatTimer* t) { + if (use_timer_) + t->start(); + } + void TimerStop(galois::StatTimer* t) { + if (use_timer_) + t->stop(); + } + //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. 
void CreateSubgraphMapping(const GNNGraph& gnn_graph, diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 4f5822d1b2..c835d05454 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -98,7 +98,7 @@ class GNNLayer { GNNLayerConfig()) {} virtual void ResizeRows(size_t new_row_count) { - layer_dimensions_.input_rows = new_row_count; + layer_dimensions_.input_rows = new_row_count; layer_dimensions_.output_rows = new_row_count; // TODO(loc) output matrix should be resized if space becomes an issue, // else just use first S rows (S = subgraph size) @@ -231,6 +231,8 @@ class GNNLayer { base_gpu_object_.PrintBackwardOutput(p_backward_output_matrix_.size()); } #endif + void EnableTimers() { use_timer_ = true; } + void DisableTimers() { use_timer_ = false; } protected: //! Layer order (starts from 0); used in backward to shortcut output as layer @@ -287,6 +289,16 @@ class GNNLayer { ////////////////////////////////////////////////////////////////////////////// + bool use_timer_{true}; + void TimerStart(galois::StatTimer* t) { + if (use_timer_) + t->start(); + } + void TimerStop(galois::StatTimer* t) { + if (use_timer_) + t->stop(); + } + //! Init based from following paper //! http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf //! Since it is unclear what j and j+1 refer to in that paper, the things diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 3f12978663..0711862240 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -96,6 +96,7 @@ class SAGELayer : public GNNLayer { private: static const constexpr char* kRegionName = "SAGELayer"; + //! CPU aggregation void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index fbaad4ff4d..02be5edbf4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -216,6 +216,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } float galois::GraphNeuralNetwork::Train(size_t num_epochs) { + EnableTimers(); const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; std::vector subgraph_layer_sizes; @@ -251,9 +252,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { graph_->ConstructSampledSubgraph(num_sampled_layers); } - galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); - galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); - galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); + galois::StatTimer epoch_timer("TrainingTime", kRegionName); + galois::StatTimer validation_timer("ValidationTime", kRegionName); + galois::StatTimer epoch_test_timer("TestTime", kRegionName); for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); @@ -277,6 +278,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { + galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); + mb_timer.start(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", @@ -308,6 +312,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices 
graph_->ConstructSampledSubgraph(num_sampled_layers); + + mb_timer.stop(); } if (!config_.train_minibatch_size()) { @@ -325,9 +331,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // create mini batch graphs and loop until minibatches on all hosts done while (true) { + galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); + mb_timer.start(); + const std::string btime_name("Epoch" + std::to_string(epoch) + "Batch" + std::to_string(batch_num)); - galois::StatTimer batch_timer(btime_name.c_str(), "GraphNeuralNetwork"); + galois::StatTimer batch_timer(btime_name.c_str(), kRegionName); batch_timer.start(); work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); @@ -377,6 +386,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // XXX resizes above only work for SAGE layers; will break if other // layers are tested + mb_timer.stop(); + const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); @@ -395,6 +406,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { : false; if (test_eval) { + DisableTimers(); float test_acc; if (!config_.test_minibatch_size()) { graph_->DisableSubgraph(); @@ -415,17 +427,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const std::string test_name_acc = "TestEpoch" + std::to_string(epoch) + "Batch" + std::to_string(batch_num - 1) + "Accuracy"; - galois::runtime::reportStat_Single("GraphNeuralNetwork", - test_name_acc, test_acc); + galois::runtime::reportStat_Single(kRegionName, test_name_acc, + test_acc); } // report the training time elapsed at this point in time galois::runtime::reportStat_Single( - "GraphNeuralNetwork", + kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + std::to_string(batch_num - 1), epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); + EnableTimers(); } epoch_timer.start(); @@ -442,7 +455,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "TrainEpoch" + std::to_string(epoch) + "Accuracy"; galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); - galois::runtime::reportStat_Single("GraphNeuralNetwork", t_name_acc, + galois::runtime::reportStat_Single(kRegionName, t_name_acc, train_accuracy); } @@ -453,6 +466,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; if (do_validate || do_test) { + DisableTimers(); // disable subgraph graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); @@ -473,8 +487,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "\n"); const std::string v_name_acc = "ValEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single("GraphNeuralNetwork", v_name_acc, - val_acc); + galois::runtime::reportStat_Single(kRegionName, v_name_acc, val_acc); } } @@ -497,7 +510,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); const std::string test_name_acc = "TestEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single("GraphNeuralNetwork", test_name_acc, + galois::runtime::reportStat_Single(kRegionName, test_name_acc, test_acc); } } @@ -505,7 +518,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_validate || do_test) { // report the training time elapsed at this point in time galois::runtime::reportStat_Single( - "GraphNeuralNetwork", "ElapsedTrainTimeEpoch" + std::to_string(epoch), + kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch), epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); @@ -538,11 +551,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } graph_->ConstructSampledSubgraph(num_sampled_layers); } + + EnableTimers(); } } uint64_t average_epoch_time = epoch_timer.get() / num_epochs; - galois::runtime::reportStat_Tavg("GraphNeuralNetwork", "AverageEpochTime", + galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", average_epoch_time); // disable subgraph graph_->DisableSubgraph(); @@ -553,7 +568,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // check test accuracy // XXX test batching - galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); + galois::StatTimer test_timer("FinalTestRun", kRegionName); float global_accuracy; test_timer.start(); @@ -570,8 +585,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (this_host == 0) { galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); - galois::runtime::reportStat_Single("GraphNeuralNetwork", - "FinalTestAccuracy", global_accuracy); + galois::runtime::reportStat_Single(kRegionName, "FinalTestAccuracy", + global_accuracy); } // return global_accuracy; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fca2c78cfd..b77d27eb7a 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -207,9 +207,13 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { - sync_substrate_ - ->sync( - "GraphAggregateSync"); + if (use_timer_) { + sync_substrate_->sync("GraphAggregateSync"); + } else { + sync_substrate_->sync("Ignore"); + } } else { sync_substrate_->sync( @@ -220,8 +224,13 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); if (!is_backward) { - sync_substrate_->sync("GraphAggregateSync"); + if (use_timer_) { + sync_substrate_->sync("GraphAggregateSync"); + } else { + sync_substrate_->sync("Ignore"); + } } else { sync_substrate_->sync( @@ -248,13 +257,24 @@ void galois::graphs::GNNGraph::AggregateSyncGPU( 
cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), layer_number); + // XXX no timer if use_timer is off if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + if (use_timer_) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->sync( + "Ignore", gnn_matrix_to_sync_column_length_); + } } else if (gnn_matrix_to_sync_column_length_ == layer_output_mtx_column_size) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + if (use_timer_) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->sync( + "Ignore", gnn_matrix_to_sync_column_length_); + } } else { GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" " match to the column size of the CUDA context"); @@ -924,10 +944,16 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sampled_degrees_.resize(partitioned_graph_->size()); bitset_sampled_degrees_.reset(); - // Write source = masters - sync_substrate_->sync( - "SampleSync"); - + // Seed nodes sync + if (use_timer_) { + sync_substrate_ + ->sync( + "SeedNodeSample"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } galois::GAccumulator local_seed_count; local_seed_count.reset(); // count # of seed nodes @@ -985,16 +1011,26 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, // sampled.reduce(), // " out of ", total.reduce(), "\n"); - std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( - galois::iterate(new_nodes), - [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, galois::loopname("NeighborhoodSampleSet")); - sync_substrate_ - ->sync( - "SampleSync"); + + if (use_timer_) { + sync_substrate_ + ->sync( + "SampleFlag"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } galois::GAccumulator local_sample_count; local_sample_count.reset(); @@ -1081,18 +1117,29 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // galois::gInfo("Num sampled edges for layer ", sample_layer_num, " is ", // sampled.reduce(), " out of ", total.reduce()); - std::vector new_nodes = bitset_sample_flag_.getOffsets(); - // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( - galois::iterate(new_nodes), - [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, galois::loopname("NeighborhoodSampleSet")); - sync_substrate_ - ->sync( - "SampleSync"); + // why not read source? 
even if it doesn't need to sample anything, it needs + // to know that it's active so that subgraph construction can proceed + // correctly + if (use_timer_) { + sync_substrate_ + ->sync( + "SampleFlag"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } // count sampled node size galois::GAccumulator local_sample_count; @@ -1121,9 +1168,15 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, gnn_sampled_out_degrees_ = &sampled_out_degrees_; // first, sync the degres of the sampled edges across all hosts - sync_substrate_ - ->sync( - "SubgraphDegree"); + if (use_timer_) { + sync_substrate_ + ->sync( + "SubgraphDegree"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } size_t num_subgraph_nodes; // use_view = true; if (!use_view) { diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 332cd98072..dcb5c0f2db 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -4,7 +4,7 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); - timer.start(); + TimerStart(&timer); CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; @@ -14,24 +14,24 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( NodeFeatureCreation(gnn_graph); // loop over each node, grab out/in edges, construct them in LC_CSR_CSC // no edge data, just topology - timer.stop(); + TimerStop(&timer); return num_subgraph_nodes_; } size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraphView", kRegionName); - timer.start(); + TimerStart(&timer); CreateSubgraphMapping(gnn_graph, num_sampled_layers); NodeFeatureCreation(gnn_graph); - timer.stop(); + TimerStop(&timer); return num_subgraph_nodes_; } void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( const GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("SIDMapping", kRegionName); - timer.start(); + TimerStart(&timer); assert(gnn_graph.size() == lid_to_subgraph_id_.size()); // clear all mappings @@ -105,14 +105,14 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( GALOIS_LOG_ASSERT(num_subgraph_nodes_ == current_sid); // num_subgraph_nodes_ = current_sid; - timer.stop(); + TimerStop(&timer); } // TODO optimize further? void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( const GNNGraph& gnn_graph) { galois::StatTimer timer("DegreeCounting", kRegionName); - timer.start(); + TimerStart(&timer); if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) { local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02); @@ -146,14 +146,14 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( }, galois::steal()); - timer.stop(); + TimerStop(&timer); } // TODO optimize further? 
void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); - timer.start(); + TimerStart(&timer); // prefix sum over subgraph degrees from previous phase to get starting points for (size_t i = 1; i < num_subgraph_nodes_; i++) { @@ -165,11 +165,11 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName); - alloc_time.start(); + TimerStart(&alloc_time); underlying_graph_.DeallocateOnly(); underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); - alloc_time.stop(); + TimerStop(&alloc_time); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { @@ -223,13 +223,13 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( assert(in_location == local_subgraph_in_degrees_[subgraph_id]); }, galois::steal()); - timer.stop(); + TimerStop(&timer); } void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( GNNGraph& gnn_graph) { galois::StatTimer timer("NodeFeatureCreation", kRegionName); - timer.start(); + TimerStart(&timer); size_t feat_length = gnn_graph.node_feature_length(); // assumes everything is already setup subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); @@ -241,5 +241,5 @@ void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), feat_length * sizeof(GNNFeature)); }); - timer.stop(); + TimerStop(&timer); } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 1dabce8476..171ae5c05d 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -192,7 +192,7 @@ void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { galois::StatTimer timer("ForwardDropout", "GNNLayer"); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, @@ -203,14 +203,14 @@ void galois::GNNLayer::DoDropout( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::ReconstructDropoutMatrix( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); - timer.start(); + TimerStart(&timer); // reuse the dropout mask from a previous dropout call size_t num_elements = output_matrix->size(); GNNFloat scale = 1. / (1. - config_.dropout_rate); @@ -230,12 +230,12 @@ void galois::GNNLayer::ReconstructDropoutMatrix( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::DoDropoutDerivative() { galois::StatTimer timer("BackwardDropout", "GNNLayer"); - timer.start(); + TimerStart(&timer); assert(p_backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. 
- config_.dropout_rate); @@ -258,12 +258,12 @@ void galois::GNNLayer::DoDropoutDerivative() { #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::Activation() { galois::StatTimer timer("ForwardActivation", "GNNLayer"); - timer.start(); + TimerStart(&timer); // TODO only does relu at the moment; should check user specified activation // and act accordingly @@ -277,27 +277,28 @@ void galois::GNNLayer::Activation() { } activation_memo_.reset(); - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.output_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - if (forward_output_matrix_[i] > 0.0) { - // do nothing, keep value; set the memo though - activation_memo_.set(i); - } else { - forward_output_matrix_[i] = 0; - } - }, - galois::loopname("ReLU")); + galois::do_all(galois::iterate(static_cast(0), + layer_dimensions_.output_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + if (forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + forward_output_matrix_[i] = 0; + } + }); #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::ActivationDerivative( PointerWithSize* gradient) { + galois::StatTimer timer("BackwardActivation", "GNNLayer"); + TimerStart(&timer); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.ActivationDerivativeGPU(gradient->data(), @@ -321,11 +322,12 @@ void galois::GNNLayer::ActivationDerivative( #ifdef GALOIS_ENABLE_GPU } #endif + TimerStop(&timer); } void galois::GNNLayer::WeightGradientSyncSum() { galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); - t.start(); + TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_.size()); // TODO(loc) remove this limitation later; can just do a loop over the weight @@ -352,7 +354,7 @@ void galois::GNNLayer::WeightGradientSyncSum() { #ifdef GALOIS_ENABLE_GPU } #endif - t.stop(); + TimerStop(&t); } void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 3f712df0f7..8e2470ffda 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -142,7 +142,7 @@ galois::SAGELayer::SAGELayer(size_t layer_num, void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); - t.start(); + TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_2_.size()); #ifdef GALOIS_ENABLE_GPU bool gpu_direct_enabled = false; @@ -168,13 +168,13 @@ void galois::SAGELayer::WeightGradientSyncSum2() { #ifdef GALOIS_ENABLE_GPU } #endif - t.stop(); + TimerStop(&t); } const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { galois::StatTimer timer("ForwardPhase", kRegionName); - timer.start(); + TimerStart(&timer); assert(input_embeddings.size() >= (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); @@ -227,7 +227,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_forward_output_matrix_.size() >= (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); - timer.stop(); + TimerStop(&timer); return p_forward_output_matrix_; } @@ -236,7 +236,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { galois::StatTimer 
timer("BackwardPhase", kRegionName); - timer.start(); + TimerStart(&timer); assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); @@ -290,11 +290,15 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // input data (prev layer input or temp1) or gradient need mask // can mask gradient if layer == 0 // otherwise must mask other + + galois::StatTimer concat_grad_timer("ConcatGradMultiply", kRegionName); + TimerStart(&concat_grad_timer); galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.output_rows, layer_dimensions_.output_columns, input_data.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); + TimerStop(&concat_grad_timer); #ifdef GALOIS_ENABLE_GPU } #endif @@ -324,11 +328,14 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { #endif // agg data holds aggregated feature vectors from forward phase + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + TimerStart(&normal_grad_timer); galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.output_rows, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); + TimerStop(&normal_grad_timer); #ifdef GALOIS_ENABLE_GPU } #endif @@ -372,11 +379,14 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { #endif // input col x input row * input row x output col + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + TimerStart(&normal_grad_timer); galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); + TimerStop(&normal_grad_timer); #ifdef GALOIS_ENABLE_GPU } #endif @@ -403,7 +413,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( DoDropoutDerivative(); } - timer.stop(); + TimerStop(&timer); return p_backward_output_matrix_; } @@ -421,14 +431,14 @@ void galois::SAGELayer::AggregateAll( [[maybe_unused]] galois::substrate::PerThreadStorage>* pts, bool is_backward) { - std::string agg_timer_name = "Aggregate"; + std::string agg_timer_name = "AggregateCompute"; if (!is_backward) { agg_timer_name += "Forward"; } else { agg_timer_name += "Backward"; } galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -445,10 +455,12 @@ void galois::SAGELayer::AggregateAll( #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, is_backward); + TimerStop(&timer); + // aggregate sync + graph_.AggregateSync(aggregate_output, column_length, is_backward); #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); } void galois::SAGELayer::AggregateAllCPU( @@ -557,17 +569,13 @@ void galois::SAGELayer::AggregateAllCPU( } } }, - galois::chunk_size<1>(), galois::steal(), - galois::loopname("ConvolutionalAggregateAll")); - - // aggregate sync - graph_.AggregateSync(aggregate_output, column_length, is_backward); + galois::chunk_size<1>(), galois::steal()); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { galois::StatTimer timer("ForwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU // TODO self change // XXX(hochan) output rows @@ -590,13 +598,13 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, 
#ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::SelfFeatureUpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { galois::StatTimer timer("SelfForwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.SelfFeatureUpdateEmbeddingsGPU( @@ -612,13 +620,13 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("BackwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); @@ -640,13 +648,13 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("SelfBackwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); @@ -667,7 +675,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 312bdab9ac..e7cd7b00d1 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -5,6 +5,9 @@ const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { + galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); + TimerStart(&timer); + // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; @@ -62,6 +65,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); #endif + TimerStop(&timer); return p_backward_output_matrix_; } @@ -81,6 +85,9 @@ galois::SoftmaxLayer::ForwardPhase( galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { + galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); + TimerStart(&timer); + const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( @@ -114,6 +121,8 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { }, galois::steal(), galois::loopname("SoftmaxBackward")); + TimerStop(&timer); + return p_backward_output_matrix_; } From 6e008daafc81cac132ae28e90062008bda5897a5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 20 May 2021 19:11:33 -0500 Subject: [PATCH 550/660] GNN subgraphs always use global norm factor Subgraph degree norm factor was buggy because I wasn't finding the incoming degrees and using them for the subgraph; this caused accuracy to get really weird depending on code route taken. For simplicity (and to speedup subgraph construction) only global degrees are used even for subgraphs. This will 100% affect time to accuracy due to possible overcompensation during training, but it saves a lot of micromanagement. 
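Roughly, the norm computation this change leaves behind boils down to the sketch below (illustration only, not part of the diff; it assumes a plain array of global out-degrees indexed by local node id, like the global_degrees_ member this repo already keeps):

    #include <cstddef>
    #include <vector>

    // degree norm using only the full-graph out-degree, even when a sampled
    // subgraph is active; a zero degree yields a zero norm
    float GlobalDegreeNorm(const std::vector<std::size_t>& global_degrees,
                           std::size_t local_id) {
      std::size_t degree = global_degrees[local_id];
      return degree != 0 ? 1.0f / static_cast<float>(degree) : 0.0f;
    }
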
--- libgnn/include/galois/MinibatchGenerator.h | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 43 ++++++++------ libgnn/src/GraphNeuralNetwork.cpp | 3 +- libgnn/src/graphs/GNNGraph.cpp | 67 +++++++++++----------- libgnn/src/layers/SAGELayer.cpp | 8 +-- libgnn/src/layers/SoftmaxLayer.cpp | 4 +- 6 files changed, 65 insertions(+), 62 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 11bce02848..8a5063ed1d 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -12,7 +12,7 @@ class MinibatchGenerator { MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, size_t master_bound) : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, - master_bound_{master_bound} { + current_position_{0}, master_bound_{master_bound} { GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } void GetNextMinibatch(std::vector* batch_mask); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 6e2b211e00..971b00e676 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -451,27 +451,33 @@ class GNNGraph { } //! Get degree norm of subgraph for particular layer (i.e. includes training) - GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { + // GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { + GNNFloat GetDegreeNorm(GraphNode n, size_t) const { if (use_subgraph_ || use_subgraph_view_) { - size_t degree; - if (!subgraph_is_train_) { - // case because degrees in each layer differ - degree = - sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; - } else { - // XXX if inductive - // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; - degree = global_degrees_[subgraph_->SIDToLID(n)]; - } - - if (degree) { - return 1.0 / degree; - } else { - return 0; - } + // TODO(loc) this is impresise: subgraph degrees differ from global + // degrees, but going to always use global degree -> not correct + return GetGlobalDegreeNorm(subgraph_->SIDToLID(n)); } else { return GetGlobalDegreeNorm(n); } + + // size_t degree; + // if (!subgraph_is_train_) { + // // case because degrees in each layer differ + // degree = + // sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; + // } else { + // // XXX if inductive + // // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + // degree = global_degrees_[subgraph_->SIDToLID(n)]; + // } + // //degree = global_degrees_[subgraph_->SIDToLID(n)]; + + // if (degree) { + // return 1.0 / degree; + // } else { + // return 0; + // } } // Get accuracy: sampling is by default false @@ -708,7 +714,8 @@ class GNNGraph { std::unique_ptr subgraph_; // Degrees for sampled subgraph - std::vector> sampled_out_degrees_; + // std::vector> sampled_out_degrees_; + // std::vector> sampled_in_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! 
if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 02be5edbf4..92af85b278 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -172,6 +172,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { // last layer input size/output rows becomes seed node size gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -499,7 +500,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize test_pred = DoInference(); epoch_test_timer.stop(); - test_acc = GetGlobalAccuracy(test_pred); } else { test_acc = MinibatchedTesting(); @@ -559,6 +559,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { uint64_t average_epoch_time = epoch_timer.get() / num_epochs; galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", average_epoch_time); + DisableTimers(); // disable subgraph graph_->DisableSubgraph(); // TODO only do this when necessary diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b77d27eb7a..a445e299a5 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -890,10 +890,10 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, // this is slightly problematic possibly, but each layer is its own // subgraph if (!is_inductive) { - sampled_out_degrees_.resize(num_layers); - for (galois::LargeArray& array : sampled_out_degrees_) { - array.create(partitioned_graph_->size()); - } + // sampled_out_degrees_.resize(num_layers); + // for (galois::LargeArray& array : sampled_out_degrees_) { + // array.create(partitioned_graph_->size()); + //} } else { subgraph_is_train_ = true; } @@ -906,7 +906,6 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); - // for now, if training node, it goes into seed node galois::do_all(galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { if (IsValidForPhase(*x, seed_phase)) { @@ -933,16 +932,16 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - if (!subgraph_is_train_) { - galois::do_all( - galois::iterate(sampled_out_degrees_), - [&](galois::LargeArray& array) { - std::fill(array.begin(), array.end(), 0); - }, - galois::chunk_size<1>()); - } - bitset_sampled_degrees_.resize(partitioned_graph_->size()); - bitset_sampled_degrees_.reset(); + // if (!subgraph_is_train_) { + // galois::do_all( + // galois::iterate(sampled_out_degrees_), + // [&](galois::LargeArray& array) { + // std::fill(array.begin(), array.end(), 0); + // }, + // galois::chunk_size<1>()); + //} + // bitset_sampled_degrees_.resize(partitioned_graph_->size()); + // bitset_sampled_degrees_.reset(); // Seed nodes sync if (use_timer_) { @@ -1093,7 +1092,7 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } } - // if here, it means edge accepted; set sampled on, mark source + // if here, it means edge accepted; set sampled on, mark // as part of next set MakeEdgeSampled(edge_iter, sample_layer_num); if (!IsInSampledGraph( @@ -1101,9 +1100,9 @@ size_t 
galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } - bitset_sampled_degrees_.set(*src_iter); + // bitset_sampled_degrees_.set(*src_iter); // degree increment - sampled_out_degrees_[sample_layer_num][*src_iter]++; + // sampled_out_degrees_[sample_layer_num][*src_iter]++; sampled += 1; } } @@ -1163,29 +1162,29 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the // real graph - use_subgraph_ = false; - use_subgraph_view_ = false; - gnn_sampled_out_degrees_ = &sampled_out_degrees_; - - // first, sync the degres of the sampled edges across all hosts - if (use_timer_) { - sync_substrate_ - ->sync( - "SubgraphDegree"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } + use_subgraph_ = false; + use_subgraph_view_ = false; + // gnn_sampled_out_degrees_ = &sampled_out_degrees_; + + //// first, sync the degres of the sampled edges across all hosts + // if (use_timer_) { + // sync_substrate_ + // ->sync( + // "SubgraphDegree"); + //} else { + // sync_substrate_ + // ->sync( + // "Ignore"); + //} size_t num_subgraph_nodes; - // use_view = true; if (!use_view) { num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); } else { // a view only has lid<->sid mappings num_subgraph_nodes = subgraph_->BuildSubgraphView(*this, num_sampled_layers); - // SortAllInEdgesBySID(); } // after this, this graph is a subgraph diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8e2470ffda..48a7da9b94 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -508,13 +508,7 @@ void galois::SAGELayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale; - if (!is_backward) { - norm_scale = source_norm; - } else { - norm_scale = - graph_.GetDegreeNorm(dst, graph_user_layer_number_); - } + GNNFloat norm_scale = source_norm; galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index e7cd7b00d1..ade63b9d1e 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -22,7 +22,9 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) { + if ((layer_phase_ == GNNPhase::kTrain || + layer_phase_ == GNNPhase::kBatch) && + !graph_.IsInSampledGraph(i)) { // XXX VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); From 7f8779eb106ab4a639925ea2dc961e023bd9f591 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 21 May 2021 15:17:25 -0500 Subject: [PATCH 551/660] Fixed rows for GNN/SAGE (not GCN) input/output rows can differ now, so intermediate matrices also have different sizes that must be used. This commit fixes that (and adds a argument to update embeddings and some other functions that deals with the rows). 
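The gist of the new bool argument on the update/derivative calls is just which row count feeds the matrix multiply, since input and output row counts can now differ; a minimal sketch (illustration only, not part of the diff; the names mirror the layer_dimensions_ fields used in the code):

    #include <cstddef>

    // pick the row count for the update SGEMM: after aggregation the matrix
    // already has output_rows rows, before aggregation it still has input_rows
    std::size_t RowsForUpdate(bool after_aggregation, std::size_t input_rows,
                              std::size_t output_rows) {
      return after_aggregation ? output_rows : input_rows;
    }
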
--- libgnn/include/galois/layers/GNNLayer.h | 11 ++- libgnn/include/galois/layers/SAGELayer.h | 6 +- libgnn/src/layers/GNNLayer.cpp | 21 ++++- libgnn/src/layers/SAGELayer.cpp | 97 ++++++++++++++---------- 4 files changed, 89 insertions(+), 46 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index c835d05454..ac8f2c8f05 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -348,9 +348,16 @@ class GNNLayer { #endif //! Mask a input size'd matrix's rows that correspond to mirrors - void MaskInputNonMasters(PointerWithSize* input); + void MaskInputNonMasters(PointerWithSize* input) { + MaskInputNonMasters(input, std::numeric_limits::max()); + } + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows); //! Mask a gradient size'd matrix's rows that correspond to mirrors - void MaskGradientNonMasters(PointerWithSize* gradients); + void MaskGradientNonMasters(PointerWithSize* input) { + MaskGradientNonMasters(input, std::numeric_limits::max()); + } + void MaskGradientNonMasters(PointerWithSize* gradients, + size_t max_rows); //! Does some math to get GB used by some # of floats double FloatElementsToGB(size_t num_of_floats) const { diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 0711862240..e127b78e73 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -118,12 +118,14 @@ class SAGELayer : public GNNLayer { bool is_backward); //! Do embedding update via mxm with this layer's weights (forward) - void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output, + bool after); //! Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output, + bool after); //! 
Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 171ae5c05d..07e839cb48 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -357,12 +357,21 @@ void galois::GNNLayer::WeightGradientSyncSum() { TimerStop(&t); } -void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { +void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, + size_t max_rows) { assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.input_columns; assert((row_index * layer_dimensions_.input_rows) <= input->size()); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); @@ -383,11 +392,19 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { } void galois::GNNLayer::MaskGradientNonMasters( - PointerWithSize* gradient) { + PointerWithSize* gradient, size_t max_rows) { assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.output_columns; + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 48a7da9b94..5aa8e24a77 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -61,21 +61,22 @@ galois::SAGELayer::SAGELayer(size_t layer_num, second_weight_optimizer_ = std::make_unique(weight_size, 1); } - size_t num_input_elements = - layer_dimensions_.input_rows * layer_dimensions_.input_columns; + // TODO(loc) dropout uses input rows; this won't work if dropout is enabled + size_t num_in_temp_elements = + layer_dimensions_.output_rows * layer_dimensions_.input_columns; // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); + ", SAGE input temp var 1 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_input_elements); + gpu_object_.AllocateInTemp1(num_in_temp_elements); } else { #endif - in_temp_1_.resize(num_input_elements, 0); + in_temp_1_.resize(num_in_temp_elements, 0); #ifdef GALOIS_ENABLE_GPU } #endif @@ -86,33 +87,33 @@ galois::SAGELayer::SAGELayer(size_t layer_num, (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 2 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); + ", 
SAGE input temp var 2 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp2(num_input_elements); + gpu_object_.AllocateInTemp2(num_in_temp_elements); } else { #endif - in_temp_2_.resize(num_input_elements, 0); + in_temp_2_.resize(num_in_temp_elements, 0); #ifdef GALOIS_ENABLE_GPU } #endif } - size_t num_output_elements = - layer_dimensions_.output_rows * layer_dimensions_.output_columns; + size_t num_out_temp = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; // only needed if out temp would be smaller than intemp if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); + ", SAGE output temp var ", num_out_temp, " (", + FloatElementsToGB(num_out_temp), " GB)"); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_output_elements); + gpu_object_.AllocateOutTemp(num_out_temp); } else { #endif - out_temp_.resize(num_output_elements, 0); + out_temp_.resize(num_out_temp, 0); #ifdef GALOIS_ENABLE_GPU } #endif @@ -122,10 +123,10 @@ galois::SAGELayer::SAGELayer(size_t layer_num, #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { // init pointers with size - p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); - p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); + p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), + num_in_temp_elements); + p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), + num_in_temp_elements); p_out_temp_ = PointerWithSize(gpu_object_.out_temp(), num_output_elements); } else { @@ -202,11 +203,11 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); - UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); + UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); } else { // update to aggregate // FW - UpdateEmbeddings(input_data, p_out_temp_.data()); + UpdateEmbeddings(input_data, p_out_temp_.data(), false); // A(FW) AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), @@ -272,11 +273,11 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( if (!sage_config_.disable_concat) { // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { - MaskInputNonMasters(&input_data); + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(input_gradient); + MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); } #ifdef GALOIS_ENABLE_GPU @@ -313,7 +314,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // mask it, then use it // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 || sage_config_.disable_concat) { - MaskInputNonMasters(&agg_data); + MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); } // if concat is disabled, then input grad isn't masked; therefore, mask 
// this to get the same effect @@ -345,11 +346,12 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // ---unmasked--- // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() >= - layer_dimensions_.input_rows * layer_dimensions_.output_columns); + layer_dimensions_.output_rows * layer_dimensions_.output_columns); // pintemp1 contains (AF)' // overwrites the dropout matrix that was in ptemp1 (needed for second // weight matrix) - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), + true); // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -361,11 +363,11 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // disable concat part is here because otherwise it would get done elsewhere // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 && sage_config_.disable_concat) { - MaskInputNonMasters(&input_data); + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(&p_out_temp_); + MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); } // W' = F^T (FW)' @@ -395,7 +397,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // derivative for update // backout = F' UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); + p_backward_output_matrix_.data(), false); } } WeightGradientSyncSum(); @@ -567,7 +569,7 @@ void galois::SAGELayer::AggregateAllCPU( } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, - GNNFloat* output) { + GNNFloat* output, bool after) { galois::StatTimer timer("ForwardXForm", kRegionName); TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU @@ -585,10 +587,17 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, layer_dimensions_.input_columns, " ", layer_dimensions_.output_columns); // CPU version is just a call into CBlas - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_.data(), output); + if (after) { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_.data(), output); + } else { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_.data(), output); + } #ifdef GALOIS_ENABLE_GPU } #endif @@ -618,7 +627,8 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, - GNNFloat* output) { + GNNFloat* output, + bool after) { galois::StatTimer timer("BackwardXForm", kRegionName); TimerStart(&timer); @@ -635,10 +645,17 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y // note input rows is used here due to transpose of aggregation - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - 
layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); + if (after) { + galois::CBlasSGEMM( + CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, + layer_dimensions_.output_columns, layer_dimensions_.input_columns, + gradients, layer_weights_.data(), output); + } else { + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); + } #ifdef GALOIS_ENABLE_GPU } #endif From a23cb1437039625dc285d27f94f6889731e72e9e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 21 May 2021 18:43:55 -0500 Subject: [PATCH 552/660] Back to using subgraph degrees Subgraph degrees used again because accuracy when sampling suffers otherwise. Don't actually need in-degrees because out-degree was used in forward (meaning it's used in backward). Also, norm factor should **NEVER** be 0 after some more thought. Added an assertion checking for it in the debug build. The other fix this commit includes is that the "choose all" mode must be set appropriately when testing occurs (since test will always use all degrees). --- libgnn/include/galois/graphs/GNNGraph.h | 49 +++++++------ libgnn/include/galois/layers/GNNLayer.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 20 +++++- libgnn/src/graphs/GNNGraph.cpp | 92 ++++++++++--------------- libgnn/src/layers/SAGELayer.cpp | 8 ++- libgnn/src/layers/SoftmaxLayer.cpp | 2 +- 6 files changed, 87 insertions(+), 85 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 971b00e676..c40a3e20de 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -451,33 +451,26 @@ class GNNGraph { } //! Get degree norm of subgraph for particular layer (i.e. 
includes training) - // GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { - GNNFloat GetDegreeNorm(GraphNode n, size_t) const { + GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_ || use_subgraph_view_) { - // TODO(loc) this is impresise: subgraph degrees differ from global - // degrees, but going to always use global degree -> not correct - return GetGlobalDegreeNorm(subgraph_->SIDToLID(n)); + size_t degree; + if (!subgraph_choose_all_) { + // case because degrees in each layer differ + degree = + sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; + } else { + // XXX if inductive + // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + degree = global_degrees_[subgraph_->SIDToLID(n)]; + } + if (degree) { + return 1.0 / degree; + } else { + return 0; + } } else { return GetGlobalDegreeNorm(n); } - - // size_t degree; - // if (!subgraph_is_train_) { - // // case because degrees in each layer differ - // degree = - // sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; - // } else { - // // XXX if inductive - // // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; - // degree = global_degrees_[subgraph_->SIDToLID(n)]; - // } - // //degree = global_degrees_[subgraph_->SIDToLID(n)]; - - // if (degree) { - // return 1.0 / degree; - // } else { - // return 0; - // } } // Get accuracy: sampling is by default false @@ -632,6 +625,11 @@ class GNNGraph { } } + bool SubgraphChooseAllStatus() { return subgraph_choose_all_; } + void EnableSubgraphChooseAll() { subgraph_choose_all_ = true; } + void DisableSubgraphChooseAll() { subgraph_choose_all_ = false; } + void SetSubgraphChooseAll(bool a) { subgraph_choose_all_ = a; } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -714,8 +712,7 @@ class GNNGraph { std::unique_ptr subgraph_; // Degrees for sampled subgraph - // std::vector> sampled_out_degrees_; - // std::vector> sampled_in_degrees_; + std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; @@ -768,7 +765,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; bool use_subgraph_view_{false}; - bool subgraph_is_train_{false}; + bool subgraph_choose_all_{false}; std::unique_ptr train_batcher_; std::unique_ptr test_batcher_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index ac8f2c8f05..45a9c08893 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -184,6 +184,7 @@ class GNNLayer { //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } + void DisableSampling() { config_.do_sampling = false; } bool IsSampledLayer() const { return config_.do_sampling; } //! Sets the graph user layer number; important for sampling as this index //! 
determines which index to use when checking for sampled edges diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 92af85b278..01974baaca 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -160,10 +160,12 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } float galois::GraphNeuralNetwork::MinibatchedTesting() { - galois::gDebug("minibatched testing"); + galois::gDebug("Minibatched Testing"); graph_->ResetTestMinibatcher(); SetLayerPhases(galois::GNNPhase::kBatch); + bool choose_all_status = graph_->SubgraphChooseAllStatus(); + uint32_t correct = 0; uint32_t total = 0; while (true) { @@ -196,6 +198,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); + graph_->EnableSubgraphChooseAll(); const PointerWithSize batch_pred = DoInference(); std::pair correct_total = @@ -211,7 +214,13 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } } - galois::gDebug("correct / total ", correct, " ", total); + galois::gDebug("Minibatching Correct / Total ", correct, " ", total); + + if (choose_all_status) { + graph_->EnableSubgraphChooseAll(); + } else { + graph_->DisableSubgraphChooseAll(); + } return (1.0 * correct) / (1.0 * total); } @@ -410,14 +419,17 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); float test_acc; if (!config_.test_minibatch_size()) { + bool f = graph_->SubgraphChooseAllStatus(); graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(graph_->size()); } SetLayerPhases(galois::GNNPhase::kTest); + graph_->EnableSubgraphChooseAll(); const PointerWithSize test_pred = DoInference(); test_acc = GetGlobalAccuracy(test_pred); + graph_->SetSubgraphChooseAll(f); } else { test_acc = MinibatchedTesting(); } @@ -466,6 +478,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { bool do_test = config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; + bool subgraph_choose_all_status = graph_->SubgraphChooseAllStatus(); + if (do_validate || do_test) { DisableTimers(); // disable subgraph @@ -474,6 +488,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { layer++) { (*layer)->ResizeRows(graph_->size()); } + graph_->EnableSubgraphChooseAll(); } if (do_validate) { @@ -522,6 +537,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); + graph_->SetSubgraphChooseAll(subgraph_choose_all_status); // TODO too much code dupe // Resconstruct the train subgraph since it was replaced by test subgraph diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index a445e299a5..0cb05e9a4f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -881,21 +881,21 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( //////////////////////////////////////////////////////////////////////////////// void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, - bool is_inductive) { + bool choose_all) { subgraph_ = std::make_unique(partitioned_graph_->size()); sample_node_timestamps_.create(partitioned_graph_->size(), std::numeric_limits::max()); edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); - // this is to hold the *global* degree of a sampled graph; yes, memory wise - // this is slightly problematic possibly, but each layer is its own - // subgraph - if (!is_inductive) { - // sampled_out_degrees_.resize(num_layers); - // for (galois::LargeArray& array : sampled_out_degrees_) { - // array.create(partitioned_graph_->size()); - //} + // this is to hold the degree of a sampled graph considering all hosts; yes, + // memory wise this is slightly problematic possibly, but each layer is its + // own subgraph + if (!choose_all) { + sampled_out_degrees_.resize(num_layers); + for (galois::LargeArray& array : sampled_out_degrees_) { + array.create(partitioned_graph_->size()); + } } else { - subgraph_is_train_ = true; + subgraph_choose_all_ = true; } } @@ -932,16 +932,17 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - // if (!subgraph_is_train_) { - // galois::do_all( - // galois::iterate(sampled_out_degrees_), - // [&](galois::LargeArray& array) { - // std::fill(array.begin(), array.end(), 0); - // }, - // galois::chunk_size<1>()); - //} - // bitset_sampled_degrees_.resize(partitioned_graph_->size()); - // bitset_sampled_degrees_.reset(); + if (!subgraph_choose_all_) { + galois::do_all( + galois::iterate(sampled_out_degrees_), + [&](galois::LargeArray& array) { + std::fill(array.begin(), array.end(), 0); + }, + galois::chunk_size<1>()); + } + + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + bitset_sampled_degrees_.reset(); // Seed nodes sync if (use_timer_) { @@ -1050,17 +1051,9 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample, bool inductive_subgraph, size_t timestamp) { - assert(!subgraph_is_train_); use_subgraph_ = false; use_subgraph_view_ = false; - galois::GAccumulator sampled; - galois::GAccumulator total; - // galois::GAccumulator total_nodes; - sampled.reset(); - total.reset(); - // total_nodes.reset(); - galois::do_all( galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { @@ -1079,7 +1072,6 @@ size_t 
galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // loop through edges, turn "on" edge with some probability for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - total += 1; if (sample_rng_.DoBernoulli(probability_of_reject)) { if (inductive_subgraph) { // only take if node is training node or a node not classified @@ -1100,22 +1092,15 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } - // bitset_sampled_degrees_.set(*src_iter); + bitset_sampled_degrees_.set(*src_iter); // degree increment - // sampled_out_degrees_[sample_layer_num][*src_iter]++; - sampled += 1; + sampled_out_degrees_[sample_layer_num][*src_iter]++; } } - // total_nodes += 1; } }, galois::steal(), galois::loopname("NeighborhoodSample")); - // galois::gInfo(host_prefix(), "sampled nodes for layer ", sample_layer_num, - // " is ", total_nodes.reduce()); - // galois::gInfo("Num sampled edges for layer ", sample_layer_num, " is ", - // sampled.reduce(), " out of ", total.reduce()); - // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( @@ -1162,22 +1147,21 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the // real graph - use_subgraph_ = false; - use_subgraph_view_ = false; - // gnn_sampled_out_degrees_ = &sampled_out_degrees_; - - //// first, sync the degres of the sampled edges across all hosts - // if (use_timer_) { - // sync_substrate_ - // ->sync( - // "SubgraphDegree"); - //} else { - // sync_substrate_ - // ->sync( - // "Ignore"); - //} + use_subgraph_ = false; + use_subgraph_view_ = false; + gnn_sampled_out_degrees_ = &sampled_out_degrees_; + + // first, sync the degres of the sampled edges across all hosts + // read any because destinations need it to for reverse phase + if (use_timer_) { + sync_substrate_ + ->sync( + "SubgraphDegree"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } size_t num_subgraph_nodes; if (!use_view) { num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 5aa8e24a77..a342cfbe14 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -499,8 +499,9 @@ void galois::SAGELayer::AggregateAllCPU( if (layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch) { // XXX + // galois::gDebug("In here"); if (IsSampledLayer()) { - if (!graph_.IsEdgeSampled(e, layer_number_)) { + if (!graph_.IsEdgeSampled(e, graph_user_layer_number_)) { continue; } } @@ -511,6 +512,7 @@ void galois::SAGELayer::AggregateAllCPU( if (!config_.disable_normalization) { GNNFloat norm_scale = source_norm; + assert(norm_scale != 0); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], @@ -532,7 +534,7 @@ void galois::SAGELayer::AggregateAllCPU( layer_phase_ == GNNPhase::kBatch) { // XXX if (IsSampledLayer()) { - if (!graph_.IsInEdgeSampled(e, layer_number_)) { + if (!graph_.IsInEdgeSampled(e, graph_user_layer_number_)) { continue; } } @@ -551,6 +553,8 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat norm_scale = graph_.GetDegreeNorm(dst, graph_user_layer_number_); + assert(norm_scale != 0); + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, diff --git 
a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index ade63b9d1e..bf9a376092 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -19,7 +19,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( #endif galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned i) { if (IsSampledLayer()) { if ((layer_phase_ == GNNPhase::kTrain || From 2fb3dac5be54bdcce223803a918bdbf2e6c18608 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 21 May 2021 23:33:01 -0500 Subject: [PATCH 553/660] GNN subgraph "choose_all" fix Choose all needs to be turned on if sample all edges use used (and vice versa). Letting init decide if this var is to be turned on was a bad idea and this commit fixes that by turning it on after the appropriate call. --- libgnn/src/graphs/GNNGraph.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0cb05e9a4f..8af26898a1 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1044,6 +1044,7 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, } }); + EnableSubgraphChooseAll(); return local_sample_count.reduce(); } @@ -1138,6 +1139,7 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } }); + DisableSubgraphChooseAll(); return local_sample_count.reduce(); } From aba12dc555d17ee9c3f49b081ca493cec98f753e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 22 May 2021 17:17:54 -0500 Subject: [PATCH 554/660] Dynamic resizing of intermediate/output matrices Subgraphs don't need full space allocation. This commit is a first pass at dynamically resizing these matrices to only use the amount of space required. Will need cleaning. --- libgnn/include/galois/GraphNeuralNetwork.h | 3 + libgnn/include/galois/graphs/GNNGraph.h | 9 ++ libgnn/include/galois/layers/GNNLayer.h | 21 +++- libgnn/include/galois/layers/SAGELayer.h | 20 ++-- libgnn/include/galois/layers/SoftmaxLayer.h | 19 ++++ libgnn/src/GraphNeuralNetwork.cpp | 72 +++++++++++--- libgnn/src/graphs/GNNGraph.cpp | 4 +- libgnn/src/graphs/GNNSubgraph.cpp | 4 +- libgnn/src/layers/GNNLayer.cpp | 55 ++++++++-- libgnn/src/layers/SAGELayer.cpp | 105 +++++++++++++++++++- libgnn/src/layers/SoftmaxLayer.cpp | 10 +- 11 files changed, 277 insertions(+), 45 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index fc200e7baa..3b5b268daa 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -199,6 +199,9 @@ class GraphNeuralNetwork { //! most literature void GradientPropagation(); + //! Call whenever resize occurs to correct reuse of pointers for layers + void CorrectBackwardLinks(); + private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c40a3e20de..775fd2af3a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -585,6 +585,15 @@ class GNNGraph { assert(node_id < size()); return partitioned_graph_->getData(node_id); } + bool IsInSampledGraphSubgraph(size_t node_id) const { + // TODO(loc) GPU + assert(node_id < size()); + if (use_subgraph_) { + return partitioned_graph_->getData(ConvertToLID(node_id)); + } else { + return partitioned_graph_->getData(node_id); + } + } //! 
Calculate norm factor considering the entire graph void CalculateFullNormFactor(); diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 45a9c08893..e61d398a64 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -100,14 +100,26 @@ class GNNLayer { virtual void ResizeRows(size_t new_row_count) { layer_dimensions_.input_rows = new_row_count; layer_dimensions_.output_rows = new_row_count; - // TODO(loc) output matrix should be resized if space becomes an issue, - // else just use first S rows (S = subgraph size) + ResizeOutputMatrix(new_row_count); } + virtual void ResizeInputOutputRows(size_t input_row, size_t output_row) { layer_dimensions_.input_rows = input_row; layer_dimensions_.output_rows = output_row; - // TODO(loc) output matrix should be resized if space becomes an issue, - // else just use first S rows (S = subgraph size) + ResizeOutputMatrix(output_row); + } + + void ResizeOutputMatrix(size_t new_output_row); + + void UpdateBackwardOutput(PointerWithSize* backward_output_matrix) { + // XXX(hochan) gpu + if (layer_number_ != 0) { + assert(backward_output_matrix->size() >= + layer_dimensions_.input_rows * layer_dimensions_.input_columns); + } else { + GALOIS_LOG_FATAL("Layer 0 should not need to update backward output"); + } + p_backward_output_matrix_ = *backward_output_matrix; } GNNPhase layer_phase() { return layer_phase_; } @@ -348,7 +360,6 @@ class GNNLayer { } #endif - //! Mask a input size'd matrix's rows that correspond to mirrors void MaskInputNonMasters(PointerWithSize* input) { MaskInputNonMasters(input, std::numeric_limits::max()); } diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index e127b78e73..581115a00e 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -44,14 +44,6 @@ class SAGELayer : public GNNLayer { : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig(), SAGELayerConfig()) {} - void ResizeRows(size_t new_row_count) { - galois::gDebug("Resizing SAGE layer for sampled graph from ", - layer_dimensions_.input_rows); - GNNLayer::ResizeRows(new_row_count); - galois::gDebug("To ", layer_dimensions_.input_rows); - // TODO(loc) resize input matrices if space is reason for doing this - } - void InitSelfWeightsTo1() { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -136,6 +128,18 @@ class SAGELayer : public GNNLayer { //! Sync second set of weight gradients void WeightGradientSyncSum2(); + void ResizeRows(size_t new_row_count) { + GNNLayer::ResizeRows(new_row_count); + ResizeIntermediates(new_row_count, new_row_count); + } + + void ResizeInputOutputRows(size_t input_row, size_t output_row) { + GNNLayer::ResizeInputOutputRows(input_row, output_row); + ResizeIntermediates(input_row, output_row); + } + + void ResizeIntermediates(size_t new_input_rows, size_t new_output_rows); + //! SAGE config params SAGELayerConfig sage_config_; //! 
Need own optimizer for the 2nd weight matrix diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 433d055f83..3878b29685 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -47,6 +47,25 @@ class SoftmaxLayer : public GNNLayer { BackwardPhase(PointerWithSize in_out, PointerWithSize* input_gradient) final; + void ResizeRows(size_t new_row_count) { + layer_dimensions_.input_rows = new_row_count; + layer_dimensions_.output_rows = new_row_count; + // no output resize + if (input_loss_.size() < new_row_count) { + input_loss_.resize(new_row_count * 1.02); + } + } + + void ResizeInputOutputRows(size_t in, size_t out) { + assert(in == out); + layer_dimensions_.input_rows = in; + layer_dimensions_.output_rows = out; + // no output resize + if (input_loss_.size() < in) { + input_loss_.resize(in * 1.02); + } + } + private: #ifdef GALOIS_ENABLE_GPU SoftmaxLayerGPU gpu_object_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 01974baaca..b29ec3af88 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -43,12 +43,24 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( prev_layer_columns = graph_->node_feature_length(); } + // max dims GNNLayerDimensions layer_dims = {.input_rows = max_rows, .input_columns = prev_layer_columns, .output_columns = config_.intermediate_layer_size(i), .output_rows = max_rows}; + // test minibatch size: if it's not enabled, then currently the full + // graph is used (should really only subgraph the test nodes, though; + // that's a TODO) + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + galois::gInfo("Not allocating rows"); + // set to 0 here to make it allocate nothing + layer_dims.input_rows = 0; + layer_dims.output_rows = 0; + } + switch (layer_type) { case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( @@ -126,6 +138,12 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( .output_columns = config_.output_layer_size(), .output_rows = max_rows}; + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + output_dims.input_rows = 0; + output_dims.output_rows = 0; + } + switch (config_.output_layer_type()) { case (GNNOutputLayerType::kSoftmax): gnn_layers_.push_back(std::move(std::make_unique( @@ -199,6 +217,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); graph_->EnableSubgraphChooseAll(); + CorrectBackwardLinks(); const PointerWithSize batch_pred = DoInference(); std::pair correct_total = @@ -260,6 +279,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectBackwardLinks(); } galois::StatTimer epoch_timer("TrainingTime", kRegionName); @@ -284,6 +304,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { l_count++; } } + CorrectBackwardLinks(); } // beginning of epoch sampling @@ -322,7 +343,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); - + CorrectBackwardLinks(); mb_timer.stop(); } @@ -344,9 +365,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); mb_timer.start(); - const std::string 
btime_name("Epoch" + std::to_string(epoch) + "Batch" + - std::to_string(batch_num)); - galois::StatTimer batch_timer(btime_name.c_str(), kRegionName); + galois::Timer batch_timer; batch_timer.start(); work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); @@ -393,6 +412,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectBackwardLinks(); // XXX resizes above only work for SAGE layers; will break if other // layers are tested @@ -423,8 +443,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { + // TODO nuclear resize (*layer)->ResizeRows(graph_->size()); } + CorrectBackwardLinks(); SetLayerPhases(galois::GNNPhase::kTest); graph_->EnableSubgraphChooseAll(); const PointerWithSize test_pred = DoInference(); @@ -443,6 +465,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::runtime::reportStat_Single(kRegionName, test_name_acc, test_acc); } + // report the training time elapsed at this point in time galois::runtime::reportStat_Single( kRegionName, @@ -484,14 +507,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); // disable subgraph graph_->DisableSubgraph(); + graph_->EnableSubgraphChooseAll(); + } + + if (do_validate) { + // XXX induced subgraph here for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { + // nuclear resize (*layer)->ResizeRows(graph_->size()); } - graph_->EnableSubgraphChooseAll(); - } - if (do_validate) { + CorrectBackwardLinks(); validation_timer.start(); SetLayerPhases(galois::GNNPhase::kValidate); const PointerWithSize val_pred = DoInference(); @@ -512,6 +539,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { float test_acc; if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize test_pred = DoInference(); epoch_test_timer.stop(); @@ -566,6 +599,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectBackwardLinks(); } EnableTimers(); @@ -578,19 +612,21 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); // disable subgraph graph_->DisableSubgraph(); - // TODO only do this when necessary - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - (*layer)->ResizeRows(graph_->size()); - } + graph_->EnableSubgraphChooseAll(); // check test accuracy - // XXX test batching galois::StatTimer test_timer("FinalTestRun", kRegionName); float global_accuracy; test_timer.start(); if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // TODO nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize predictions = DoInference(); global_accuracy = GetGlobalAccuracy(predictions); @@ -673,3 +709,15 @@ void galois::GraphNeuralNetwork::GradientPropagation() { gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); } } + +void galois::GraphNeuralNetwork::CorrectBackwardLinks() { + // layer chain pointer + PointerWithSize prev_output_layer(nullptr, 0); 
+ for (size_t layer_num = 0; layer_num < gnn_layers_.size(); layer_num++) { + // first layer is nullptr so can be ignored + if (layer_num != 0) { + gnn_layers_[layer_num]->UpdateBackwardOutput(&prev_output_layer); + } + prev_output_layer = gnn_layers_[layer_num]->GetForwardOutput(); + } +} diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8af26898a1..b7cb9596e0 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -697,7 +697,6 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( PointerWithSize predictions, GNNPhase phase, bool) { // check owned nodes' accuracy - assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); total_checked_.reset(); @@ -722,7 +721,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( } }, // steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("GlobalAccuracy")); + galois::steal()); size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); @@ -736,7 +735,6 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( std::pair galois::graphs::GNNGraph::GetBatchAccuracy( PointerWithSize predictions) { // check owned nodes' accuracy - assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); total_checked_.reset(); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index dcb5c0f2db..2493319904 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -144,7 +144,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", // out_degrees, " in ", in_degrees); }, - galois::steal()); + galois::loopname("DegreeCountingDoAll"), galois::steal()); TimerStop(&timer); } @@ -222,7 +222,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( assert(out_location == local_subgraph_out_degrees_[subgraph_id]); assert(in_location == local_subgraph_in_degrees_[subgraph_id]); }, - galois::steal()); + galois::loopname("EdgeCreationDoAll"), galois::steal()); TimerStop(&timer); } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 07e839cb48..4c828dbb19 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -103,6 +103,31 @@ galois::GNNLayer::GNNLayer(size_t layer_num, #endif } +void galois::GNNLayer::ResizeOutputMatrix(size_t new_output_row) { + size_t num_output_elements = + new_output_row * layer_dimensions_.output_columns; + + if (!config_.disable_output && + (forward_output_matrix_.size() < num_output_elements)) { + galois::gInfo(graph_.host_prefix(), "Resizing layer ", layer_number_, + ", forward output matrix to ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + // resize with a bit of a buffer to prevent possible future resizes + size_t buffer_size = (num_output_elements * 0.02); + forward_output_matrix_.resize(num_output_elements + buffer_size, 0); + } + + // XXX(hochan) GPU end +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif + // reinitialize the PointerWithSize wrappers + p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif +} + void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + layer_dimensions_.input_columns); @@ 
-272,20 +297,23 @@ void galois::GNNLayer::Activation() { base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); } else { #endif - if (activation_memo_.size() == 0) { - activation_memo_.resize(forward_output_matrix_.size()); + if (activation_memo_.size() != p_forward_output_matrix_.size()) { + activation_memo_.resize(p_forward_output_matrix_.size()); } activation_memo_.reset(); + assert(activation_memo_.size() == p_forward_output_matrix_.size()); + assert(layer_dimensions_.output_rows * layer_dimensions_.output_columns <= + p_forward_output_matrix_.size()); galois::do_all(galois::iterate(static_cast(0), layer_dimensions_.output_rows * layer_dimensions_.output_columns), [&](size_t i) { - if (forward_output_matrix_[i] > 0.0) { + if (p_forward_output_matrix_[i] > 0.0) { // do nothing, keep value; set the memo though activation_memo_.set(i); } else { - forward_output_matrix_[i] = 0; + p_forward_output_matrix_[i] = 0; } }); #ifdef GALOIS_ENABLE_GPU @@ -305,6 +333,8 @@ void galois::GNNLayer::ActivationDerivative( gradient->size()); } else { #endif + assert(gradient->size() >= + layer_dimensions_.output_rows * layer_dimensions_.output_columns); // TODO only does relu at the moment; should check user specified activation // and act accordingly // keep gradient if the original output was greater than 0 @@ -362,8 +392,6 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); - size_t row_index = layer_dimensions_.input_columns; - assert((row_index * layer_dimensions_.input_rows) <= input->size()); if (start_node > max_rows) { start_node = max_rows; @@ -372,6 +400,10 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, end_node = max_rows; } + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); @@ -396,7 +428,6 @@ void galois::GNNLayer::MaskGradientNonMasters( assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); - size_t row_index = layer_dimensions_.output_columns; if (start_node > max_rows) { start_node = max_rows; @@ -405,6 +436,16 @@ void galois::GNNLayer::MaskGradientNonMasters( end_node = max_rows; } + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradient->size()); + assert(end_node * row_index <= gradient->size()); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index a342cfbe14..70d85b853a 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -141,6 +141,86 @@ galois::SAGELayer::SAGELayer(size_t layer_num, GALOIS_LOG_VERBOSE("SAGE layer initialized"); } +void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, + size_t new_output_rows) { + size_t num_in_temp_elements = + new_output_rows * layer_dimensions_.input_columns; + galois::gDebug("Layer num ", layer_number_, " ", in_temp_1_.size(), " and ", + num_in_temp_elements, " ", 
layer_dimensions_.input_columns, + " ", layer_dimensions_.output_columns); + + // if in temp is smaller than out temp, or if dropout exists + if (!config_.disable_dropout || config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gDebug("in first if"); + if (in_temp_1_.size() < num_in_temp_elements) { + galois::gDebug("in the resize"); + galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, + ", SAGE input temp var 1 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_1_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_1_ = PointerWithSize(in_temp_1_); + } + } + + // only on in dropout case + if in temp is smaller than out temp + if (!config_.disable_dropout && + (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { + if (in_temp_2_.size() < num_in_temp_elements) { + galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, + ", SAGE input temp var 2 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_2_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_2_ = PointerWithSize(in_temp_2_); + } + } + + size_t num_output_temp_elements = + new_input_rows * layer_dimensions_.output_columns; + // only needed if out temp would be smaller than intemp + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + if (out_temp_.size() < num_output_temp_elements) { + galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, + ", SAGE output temp var ", num_output_temp_elements, " (", + FloatElementsToGB(num_output_temp_elements), " GB)"); + size_t buffer_size = (num_output_temp_elements * 0.02); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); + } else { +#endif + out_temp_.resize(num_output_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + p_out_temp_ = PointerWithSize(out_temp_); + } + } +} + void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); TimerStart(&t); @@ -174,6 +254,10 @@ void galois::SAGELayer::WeightGradientSyncSum2() { const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { + galois::gDebug( + "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", + layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", + layer_dimensions_.output_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -200,15 +284,28 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( // flip aggregate/update if dimensions favor it (do less work) if (config_.disable_aggregate_after_update || 
layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { + assert(p_in_temp_2_.size() >= + layer_dimensions_.output_rows * layer_dimensions_.input_columns); + } else { + assert(p_in_temp_1_.size() >= + layer_dimensions_.output_rows * layer_dimensions_.input_columns); + } // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); + assert(p_forward_output_matrix_.size() >= + layer_dimensions_.output_columns * layer_dimensions_.output_columns); UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); } else { + assert(p_out_temp_.size() >= + layer_dimensions_.input_rows * layer_dimensions_.output_columns); // update to aggregate // FW UpdateEmbeddings(input_data, p_out_temp_.data(), false); // A(FW) + assert(p_forward_output_matrix_.size() >= + layer_dimensions_.output_columns * layer_dimensions_.output_columns); AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), &output_column_intermediates_); @@ -595,12 +692,12 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, galois::CBlasSGEMM( CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_.data(), output); + node_embeddings, p_layer_weights_.data(), output); } else { galois::CBlasSGEMM( CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_.data(), output); + node_embeddings, p_layer_weights_.data(), output); } #ifdef GALOIS_ENABLE_GPU } @@ -653,12 +750,12 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, galois::CBlasSGEMM( CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, - gradients, layer_weights_.data(), output); + gradients, p_layer_weights_.data(), output); } else { galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); + p_layer_weights_.data(), output); } #ifdef GALOIS_ENABLE_GPU } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index bf9a376092..eb6e900413 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -24,7 +24,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( if (IsSampledLayer()) { if ((layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch) && - !graph_.IsInSampledGraph(i)) { + !graph_.IsInSampledGraphSubgraph(i)) { // XXX VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); @@ -60,7 +60,8 @@ galois::SoftmaxLayer::ForwardPhaseCPU( }, // TODO chunk size? 
// steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("SoftmaxForward")); + // galois::steal(), galois::loopname("SoftmaxForward")); + galois::steal()); #ifndef NDEBUG GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); @@ -93,12 +94,13 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + // galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned node) { if (graph_.IsValidForPhase(node, layer_phase_)) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(node)) + !graph_.IsInSampledGraphSubgraph(node)) return; } From 3e22e29d87c2f545b5bd9afa9b8035ed06017c82 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 1 Jun 2021 17:43:17 -0500 Subject: [PATCH 555/660] QoL layers/size CLI specification User no longer has to specify layer name for every layer specified; just replicate one layer type + size. This makes it so you can't have heterogeneous layers/sizes, but that setting isn't explored much anyways. Also reports more CLI options in stats file. --- lonestar/libgnnbench/include/GNNBench/Start.h | 7 + lonestar/libgnnbench/src/Input.cpp | 169 +++++++++++------- lonestar/libgnnbench/src/Start.cpp | 22 ++- 3 files changed, 129 insertions(+), 69 deletions(-) diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index 75ec167f78..48507df80e 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -13,6 +13,11 @@ extern llvm::cl::opt num_threads; extern llvm::cl::opt num_epochs; +extern llvm::cl::opt layer_size; +extern llvm::cl::opt cl_layer_type; +extern llvm::cl::opt train_minibatch_size; +extern llvm::cl::opt test_minibatch_size; +extern llvm::cl::opt do_graph_sampling; #ifdef GALOIS_ENABLE_GPU std::string personality_str(DevicePersonality p); @@ -24,6 +29,8 @@ void heteroSetup(); }; #endif +const char* GNNLayerToString(galois::GNNLayerType s); + //////////////////////////////////////////////////////////////////////////////// // Init functions //////////////////////////////////////////////////////////////////////////////// diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 5facfa95c5..3ebee8adea 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -32,18 +32,26 @@ llvm::cl::opt num_layers( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list layer_sizes( - "layerSizes", +// llvm::cl::list layer_sizes( +// "layerSizes", +// cll::desc( +// "Comma separated list of numbers specifying " +// "intermediate layer sizes (does not include output); default sizes are +// " "16 until last layer which is the size of the # of labels"), +// cll::CommaSeparated); + +llvm::cl::opt layer_size( + "layerSize", cll::desc( - "Comma separated list of numbers specifying " + "Number specifying " "intermediate layer sizes (does not include output); default sizes are " "16 until last layer which is the size of the # of labels"), - cll::CommaSeparated); + cll::init(16)); -llvm::cl::list cl_layer_types( - "layerTypes", - cll::desc("Comma separated list of layer types specifying " - "intermediate layers (does not include output)"), +llvm::cl::opt cl_layer_type( + "layerType", + cll::desc("Layer type 
specifying " + "intermediate layers (does not include output); default SAGE"), cll::values( clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", "Graph Convolutional Layer (default)"), @@ -51,7 +59,7 @@ llvm::cl::list cl_layer_types( "SAGE layer (GCN with concat + mean)"), clEnumValN(galois::GNNLayerType::kL2Norm, "l2norm", "L2 norm layer"), clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), - cll::CommaSeparated); + cll::init(galois::GNNLayerType::kSAGE)); llvm::cl::list cl_fan_out_vector( "samplingFanOut", @@ -169,19 +177,22 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { //! Initializes the vector of layer sizes from command line args + graph std::vector CreateLayerTypesVector() { std::vector layer_types; - if (!cl_layer_types.size()) { - // default is all GCN layers - for (size_t i = 0; i < num_layers; i++) { - layer_types.emplace_back(galois::GNNLayerType::kGraphConvolutional); - } - } else { - GALOIS_LOG_VASSERT(cl_layer_types.size() == num_layers, - "Number layer types should be {} not {}", num_layers, - cl_layer_types.size()); - for (size_t i = 0; i < num_layers; i++) { - layer_types.emplace_back(cl_layer_types[i]); - } + for (size_t i = 0; i < num_layers; i++) { + layer_types.emplace_back(cl_layer_type); } + // if (!cl_layer_types.size()) { + // // default is all GCN layers + // for (size_t i = 0; i < num_layers; i++) { + // layer_types.emplace_back(galois::GNNLayerType::kGraphConvolutional); + // } + //} else { + // GALOIS_LOG_VASSERT(cl_layer_types.size() == num_layers, + // "Number layer types should be {} not {}", num_layers, + // cl_layer_types.size()); + // for (size_t i = 0; i < num_layers; i++) { + // layer_types.emplace_back(cl_layer_types[i]); + // } + //} return layer_types; } @@ -190,34 +201,41 @@ std::vector CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { // set layer sizes for intermdiate and output layers std::vector layer_sizes_vector; - if (layer_sizes.size()) { - GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - for (size_t i = 0; i < num_layers; i++) { - layer_sizes_vector.emplace_back(layer_sizes[i]); - } - // verify user satisfies last intermediate layer needing to have same size - // as # label classes - if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { - galois::gWarn( - "Size of last layer (", layer_sizes_vector.back(), - ") is not equal to # label classes: forcefully changing it to ", - gnn_graph->GetNumLabelClasses()); - layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); - layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); - } - GALOIS_LOG_ASSERT(layer_sizes_vector.back() == - gnn_graph->GetNumLabelClasses()); - } else { - // default 16 for everything until last 2 - for (size_t i = 0; i < num_layers - 1; i++) { - layer_sizes_vector.emplace_back(16); - } - // last 2 sizes must be equivalent to # label classes; this is the last - // intermediate layer - layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // for (size_t i = 0; i < num_layers; i++) { + // layer_sizes_vector.emplace_back(layer_sizes[i]); + // } + // // verify user satisfies last intermediate layer needing to have same size + // // as # label classes + // if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { + // galois::gWarn( + // "Size of last layer (", layer_sizes_vector.back(), + // ") is not equal to # label classes: forcefully changing it to ", + // 
gnn_graph->GetNumLabelClasses()); + // layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); + // layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); + // } + + // GALOIS_LOG_ASSERT(layer_sizes_vector.back() == + // gnn_graph->GetNumLabelClasses()); + //} else { + // // default 16 for everything until last 2 + // for (size_t i = 0; i < num_layers - 1; i++) { + // layer_sizes_vector.emplace_back(16); + // } + // // last 2 sizes must be equivalent to # label classes; this is the last + // // intermediate layer + // layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + //} + + for (size_t i = 0; i < num_layers - 1; i++) { + layer_sizes_vector.emplace_back(layer_size); } - + // last 2 sizes must be equivalent to # label classes; this is the last + // intermediate layer + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); // TODO // for now only softmax layer which dictates the output size of the last // intermediate layer + size of the output layer @@ -245,29 +263,44 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { // optimizer sizes are based on intermediate layer sizes, input feats, and // # label classes - if (layer_sizes.size()) { - GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); - // assumption here is that if it reached this point then layer sizes were - // already sanity checked previously (esp. last layer) - for (size_t i = 1; i < num_layers; i++) { - opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); - } + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); + // // assumption here is that if it reached this point then layer sizes were + // // already sanity checked previously (esp. 
last layer) + // for (size_t i = 1; i < num_layers; i++) { + // opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); + // } + //} else { + // // everything is size 16 until last + // if (num_layers == 1) { + // // single layer requires a bit of special handling + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * + // gnn_graph->GetNumLabelClasses()); + // } else { + // // first + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); + // for (size_t i = 1; i < num_layers - 1; i++) { + // opt_sizes.emplace_back(16 * 16); + // } + // // last + // opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + // } + //} + + // everything is size 16 until last + if (num_layers == 1) { + // single layer requires a bit of special handling + opt_sizes.emplace_back(gnn_graph->node_feature_length() * + gnn_graph->GetNumLabelClasses()); } else { - // everything is size 16 until last - if (num_layers == 1) { - // single layer requires a bit of special handling - opt_sizes.emplace_back(gnn_graph->node_feature_length() * - gnn_graph->GetNumLabelClasses()); - } else { - // first - opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); - for (size_t i = 1; i < num_layers - 1; i++) { - opt_sizes.emplace_back(16 * 16); - } - // last - opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + // first + opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_size); + for (size_t i = 1; i < num_layers - 1; i++) { + opt_sizes.emplace_back(layer_size * layer_size); } + // last + opt_sizes.emplace_back(layer_size * gnn_graph->GetNumLabelClasses()); } GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index aa059c60f6..9a7e747744 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -45,6 +45,18 @@ static void PrintVersion(llvm::raw_ostream& out) { out.flush(); } +const char* GNNLayerToString(galois::GNNLayerType s) { + switch (s) { + case galois::GNNLayerType::kSAGE: + return "sage"; + case galois::GNNLayerType::kGraphConvolutional: + return "gcn"; + default: + GALOIS_LOG_FATAL("Invalid gnn layer"); + return ""; + } +} + //////////////////////////////////////////////////////////////////////////////// void GNNBenchStart(int argc, char** argv, const char* app) { @@ -95,7 +107,15 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("GNNBench", "Input", input_name); galois::runtime::reportParam("GNNBench", "PartitionScheme", GNNPartitionToString(partition_scheme)); - // XXX report the rest of the command line options + galois::runtime::reportParam("GNNBench", "HiddenLayerSize", layer_size); + galois::runtime::reportParam("GNNBench", "LayerType", + GNNLayerToString(cl_layer_type)); + galois::runtime::reportParam("GNNBench", "TrainingMinibatchSize", + train_minibatch_size); + galois::runtime::reportParam("GNNBench", "TestingMinibatchSize", + test_minibatch_size); + galois::runtime::reportParam("GNNBench", "IsGraphSampled", + do_graph_sampling); } char name[256]; From d4a8a362cdce1a69d0db7ebeb0c77015a3f3b6f9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 3 Jun 2021 18:35:18 -0500 Subject: [PATCH 556/660] GSTL vs std memory leak test Test written to isolate effects of memory leaks in GNN gluon communication. Basically, gstl vector must be allocated/freed by same parallel operator or memory will leak. 
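A minimal sketch of the safe pattern (not part of this test; Use() is a
hypothetical consumer): allocate and destroy the gstl vector inside the same
parallel operator, so the blocks are freed by the thread that allocated them:

    galois::do_all(galois::iterate(size_t{0}, how_many), [&](size_t i) {
      galois::gstl::Vector<int> tmp(16); // allocated by this thread
      Use(tmp);                          // hypothetical consumer
      // tmp destroyed here by the same thread that allocated it
    });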
--- libgnn/test/CMakeLists.txt | 3 +++ libgnn/test/gstl_test.cpp | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 libgnn/test/gstl_test.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index b9c1eea043..a1ea769105 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -25,6 +25,9 @@ add_executable(mkl_micro_delete_galois mkl_micro.cpp) target_link_libraries(mkl_micro_delete_galois galois_gnn) target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) +add_executable(gstl_test gstl_test.cpp) +target_link_libraries(gstl_test galois_shmem) + ################################################################################ add_executable(gnngraph-test gnngraph-test.cpp) diff --git a/libgnn/test/gstl_test.cpp b/libgnn/test/gstl_test.cpp new file mode 100644 index 0000000000..ef89d96a8b --- /dev/null +++ b/libgnn/test/gstl_test.cpp @@ -0,0 +1,42 @@ +#include "galois/Galois.h" +#include "galois/gstl.h" + +int main(int argc, char* argv[]) { + galois::SharedMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); + + // std vector has no leak issues + using VecType = galois::gstl::Vector; + // using VecType = std::vector; + + for (size_t i = 0; i < 1000000; i++) { + if (i % 10000 == 0) + galois::gPrint("Current is ", i, "\n"); + size_t how_many = 100000; + + std::vector carrier; + carrier.resize(how_many); + + galois::do_all(galois::iterate(size_t{0}, how_many), [&](size_t iter) { + // allocate some vector then do something with it + VecType dummy_vec(16); + for (unsigned j = 0; j < dummy_vec.size(); j++) { + dummy_vec[j] = j; + } + carrier[iter].swap(dummy_vec); + }); + + galois::do_all(galois::iterate(size_t{0}, how_many), [&](size_t iter) { + VecType to_swap; + carrier[iter].swap(to_swap); + }); + } + + return 0; +} From 05888c4be7c52c98227dd7f30e27ff5095a701b4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 4 Jun 2021 20:03:20 -0500 Subject: [PATCH 557/660] wip 2dvec --- libgalois/include/galois/TwoDVector.h | 41 ++ .../include/galois/graphs/GluonSubstrate.h | 520 ++++++++++++++++-- .../galois/graphs/DegreeSyncStructures.h | 18 + .../graphs/GraphAggregationSyncStructures.h | 55 ++ 4 files changed, 597 insertions(+), 37 deletions(-) create mode 100644 libgalois/include/galois/TwoDVector.h diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h new file mode 100644 index 0000000000..ab3a7ff882 --- /dev/null +++ b/libgalois/include/galois/TwoDVector.h @@ -0,0 +1,41 @@ +#pragma once + +#include "gstl.h" +#include "PODResizeableArray.h" + +namespace galois { + +template +class TwoDVector { +public: + using value_type = T; + + void SetVecSize(size_t fixed_vector_size) { + fixed_vector_size_ = fixed_vector_size; + } + + //! Call this before using this else bad things will happen: initializes + //! 
the memory + fixed size metadata + void Create(size_t num_elements) { + num_elements_ = num_elements; + underlying_memory_.resize(num_elements_ * fixed_vector_size_); + } + void SetVector(size_t index, const galois::gstl::Vector& to_copy) { + // TODO(loc) for generality should work with any vector type, but for + // now just use gstl + assert(index < num_elements_); + assert(to_copy == fixed_vector_size_); + size_t array_index = index * fixed_vector_size_; + std::memcpy((void*)(&(underlying_memory_[array_index])), + (void*)to_copy.data(), + sizeof(T) * fixed_vector_size_); + } + + const PODResizeableArray& data() { return underlying_memory_; } +private: + size_t num_elements_{0}; + size_t fixed_vector_size_{0}; + PODResizeableArray underlying_memory_; +}; + +} diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index f102e3a4a1..6ccf35edec 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -29,6 +29,7 @@ #include #include +#include "galois/TwoDVector.h" #include "galois/runtime/GlobalObj.h" #include "galois/runtime/DistStats.h" #include "galois/runtime/SyncStructures.h" @@ -716,9 +717,38 @@ class GluonSubstrate : public galois::runtime::GlobalObject { struct is_vector_of_vec, A>> : public std::true_type {}; + template + struct IsVector : public std::false_type {}; + template + struct IsVector> : public std::true_type {}; + + template + struct Is2DVector : public std::false_type {}; + template + struct Is2DVector> : public std::true_type {}; + //////////////////////////////////////////////////////////////////////////////// // Message prep functions (buffering, send buffer getting, etc.) //////////////////////////////////////////////////////////////////////////////// + + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::value>::type* = nullptr> + void getSendBuffer(std::string loopName, unsigned x, + galois::runtime::SendBuffer& b, size_t elem_size) { + auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + + SyncExtract2D( + loopName, x, sharedNodes[x], b, elem_size); + + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string statSendBytes_str(syncTypeStr + "SendBytes_" + + get_run_identifier(loopName)); + + galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size()); + } + /** * Get data that is going to be sent for synchronization and returns * it in a send buffer. @@ -735,27 +765,17 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template < SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, bool async, - typename std::enable_if::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? 
mirrorNodes : masterNodes; if (BitsetFnTy::is_valid()) { - if (is_vector_of_vec::value) { - syncExtractFloatVecHack( - loopName, x, sharedNodes[x], b, elem_size); - } else { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); - } + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); } else { - if (is_vector_of_vec::value) { - // TODO (loc) - GALOIS_LOG_FATAL("implement me"); - } else { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); - } + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); } std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -764,23 +784,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size()); } - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async, - typename std::enable_if::type* = nullptr> - void getSendBuffer(std::string loopName, unsigned x, - galois::runtime::SendBuffer& b, size_t elem_size) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; - - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); - - std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; - std::string statSendBytes_str(syncTypeStr + "SendBytesVector_" + - get_run_identifier(loopName)); - - galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size()); - } /** * Given data to serialize in val_vec, serialize it into the send buffer @@ -883,6 +886,47 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // Calls data on the TwoDVector + template + void + SerializeMessage2D(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, + galois::runtime::SendBuffer& b) { + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tserialize( + serialize_timer_str.c_str(), RNAME); + if (data_mode == noData) { + if (!async) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } + } else if (data_mode == gidsData) { + offsets.resize(bit_set_count); + convertLIDToGID(loopName, indices, offsets); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); + Tserialize.stop(); + } else if (data_mode == offsetsData) { + offsets.resize(bit_set_count); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); + Tserialize.stop(); + } else if (data_mode == bitsetData) { + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, bit_set_comm, two_d_vec.data()); + Tserialize.stop(); + } else { // onlyData + Tserialize.start(); + gSerialize(b, data_mode, two_d_vec.data()); + Tserialize.stop(); + } + } + /** * Given the data mode, deserialize the rest of a message in a Receive Buffer. * @@ -1239,6 +1283,47 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + template + void ExtractSubset2D(const std::string& loopName, + const std::vector& indices, size_t size, + const galois::PODResizeableArray& offsets, + VecTy& two_d_vector, size_t start = 0) { + if (parallelize) { + std::string syncTypeStr = + (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string doall_str(syncTypeStr + "ExtractVal_" + loopName); + + galois::do_all( + galois::iterate(start, start + size), + [&](unsigned int n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + size_t lid = indices[offset]; + two_d_vector.SetVector(n - start, extractWrapper(lid)); + }, +#if GALOIS_COMM_STATS + galois::loopname(get_run_identifier(doall_str).c_str()), +#endif + galois::no_stats()); + } else { // non-parallel version + for (unsigned n = start; n < start + size; ++n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + + size_t lid = indices[offset]; + two_d_vector.SetVector(n - start, extractWrapper(lid)); + } + } + } + + /** * Based on provided arguments, extracts the data that we are interested * in sending into val_vec. Same as above, except it has the vecIndex @@ -1455,6 +1540,23 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // 2D + template + void SetWrapper2D(size_t lid, typename FnTy::ValTy::value_type* pointer_to_data, + galois::DynamicBitSet& bit_set_compute) { + if (syncType == syncReduce) { + if (FnTy::reduce(lid, userGraph.getData(lid), pointer_to_data)) { + if (bit_set_compute.size() != 0) + bit_set_compute.set(lid); + } + } else { + if (async) + FnTy::reduce(lid, userGraph.getData(lid), pointer_to_data); + else + FnTy::setVal(lid, userGraph.getData(lid), pointer_to_data); + } + } + /** * VECTOR VARIANT. * @@ -1554,6 +1656,51 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // 2D; vecty is a PODResize + template + void SetSubset2D(const std::string& loopName, const IndicesVecTy& indices, + size_t size, + const galois::PODResizeableArray& offsets, + VecTy& val_vec, galois::DynamicBitSet& bit_set_compute, + size_t start = 0) { + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string doall_str(syncTypeStr + "SetVal_" + + get_run_identifier(loopName)); + + if (parallelize) { + galois::do_all( + galois::iterate(start, start + size), + [&](unsigned int n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + auto lid = indices[offset]; + SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + bit_set_compute); + }, +#if GALOIS_COMM_STATS + galois::loopname(get_run_identifier(doall_str).c_str()), +#endif + galois::no_stats()); + } else { + for (unsigned int n = start; n < start + size; ++n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + auto lid = indices[offset]; + SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + bit_set_compute); + } + } + } + + /** * VECTOR BITSET VARIANT. * @@ -2140,6 +2287,128 @@ class GluonSubstrate : public galois::runtime::GlobalObject { 1); } + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async> + void SyncExtract2D(std::string loopName, unsigned from_id, + std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { + uint32_t num = indices.size() * elem_size; + galois::DynamicBitSet& bit_set_comm = syncBitset; + static VecTy two_d_array; + two_d_array.SetVecSize(SyncFnTy::FeatVecSize()); + galois::PODResizeableArray& offsets = syncOffsets; + + ////////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string extract_timer_str(syncTypeStr + "Extract_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textract(extract_timer_str.c_str(), + RNAME); + std::string extract_alloc_timer_str(syncTypeStr + "ExtractAlloc_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractalloc( + extract_alloc_timer_str.c_str(), RNAME); + std::string extract_batch_timer_str(syncTypeStr + "ExtractBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractbatch( + extract_batch_timer_str.c_str(), RNAME); + ////////////////////////////////////////////////////////////////////////////// + + DataCommMode data_mode; + Textract.start(); + + if (num > 0) { + size_t bit_set_count = 0; + Textractalloc.start(); + b.reserve(getMaxSendBufferSize(num)); + Textractalloc.stop(); + + Textractbatch.start(); + bool batch_succeeded = extractBatchWrapper( + from_id, b, bit_set_count, data_mode); + Textractbatch.stop(); + + // GPUs have a batch function they can use; CPUs do not; therefore, + // CPUS always enter this if block + if (!batch_succeeded) { + Textractalloc.start(); + b.resize(0); + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + bit_set_comm.resize(num); + offsets.resize(num); + two_d_array.Create(num); + Textractalloc.stop(); + const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + getBitsetAndOffsets( + loopName, indices, bit_set_compute, bit_set_comm, offsets, + bit_set_count, data_mode); + + if (data_mode == onlyData) { + bit_set_count = indices.size(); + ExtractSubset2D( + loopName, indices, bit_set_count, offsets, two_d_array); + } else if (data_mode != + noData) { // bitsetData or offsetsData or gidsData + ExtractSubset2D( + loopName, indices, bit_set_count, offsets, two_d_array); + } + + SerializeMessage2D( + loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, + two_d_array, b); + } else { + // TODO(loc/hochan) GPU + //if (data_mode == noData) { + // b.resize(0); + // if (!async) { + // gSerialize(b, data_mode); + // } + //} else if (data_mode == gidsData) { + // b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + // sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + // sizeof(size_t) + + // (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + //} else if (data_mode == offsetsData) { + // b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + // sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + // sizeof(size_t) + + // (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + //} else if (data_mode == bitsetData) { + // size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t); + // b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + // sizeof(size_t) // bitset size + // + sizeof(size_t) // bitset vector size + // + bitset_alloc_size + sizeof(size_t) + + // (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + //} else { // onlyData + // b.resize(sizeof(DataCommMode) + sizeof(size_t) + + // (num * sizeof(typename SyncFnTy::ValTy))); + //} + GALOIS_LOG_FATAL("Make sure this is implemented correctly"); + } + reportRedundantSize(loopName, syncTypeStr, num, bit_set_count, + bit_set_comm); + } else { + data_mode = noData; + b.resize(0); + if (!async) { + gSerialize(b, noData); + } + } + + Textract.stop(); + + std::string metadata_str(syncTypeStr + "MetadataMode_" + + std::to_string(data_mode) + "_" + + get_run_identifier(loopName)); + galois::runtime::reportStatCond_Single(RNAME, metadata_str, + 1); + } + + /** * Vector bitset variant. 
* @@ -2521,6 +2790,103 @@ class GluonSubstrate : public galois::runtime::GlobalObject { return retval; } + // TODO (loc) way too much code duplication + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async> + size_t SyncRecvApply2D(uint32_t from_id, + galois::runtime::RecvBuffer& buf, + std::string loopName) { + //////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string set_timer_str(syncTypeStr + "Set_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tset(set_timer_str.c_str(), RNAME); + std::string set_batch_timer_str(syncTypeStr + "SetBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tsetbatch( + set_batch_timer_str.c_str(), RNAME); + //////////////////////////////////////////////////////////////////////////// + + galois::DynamicBitSet& bit_set_comm = syncBitset; + //static VecTy two_d_vector; + galois::PODResizeableArray& offsets = syncOffsets; + + auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; + + Tset.start(); + + if (num > 0) { // only enter if we expect message from that host + DataCommMode data_mode; + // 1st deserialize gets data mode + galois::runtime::gDeserialize(buf, data_mode); + + if (data_mode != noData) { + Tsetbatch.start(); + bool batch_succeeded = + setBatchWrapper(from_id, buf, data_mode); + Tsetbatch.stop(); + + // cpu always enters this block + if (!batch_succeeded) { + size_t bit_set_count = num; + size_t buf_start = 0; + + using DeserialPOD = galois::PODResizeableArray; + DeserialPOD deserial_pod; + + // deserialize the rest of the data in the buffer depending on the + // data mode; arguments passed in here are mostly output vars + deserializeMessage(loopName, data_mode, num, buf, + bit_set_count, offsets, bit_set_comm, + buf_start, retval, deserial_pod); + + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + + galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + if (data_mode == bitsetData) { + size_t bit_set_count2; + getOffsetsFromBitset(loopName, bit_set_comm, offsets, + bit_set_count2); + assert(bit_set_count == bit_set_count2); + } + + if (data_mode == onlyData) { + SetSubset2D(loopName, sharedNodes[from_id], + bit_set_count, offsets, deserial_pod, + bit_set_compute); + } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { + SetSubset2D(loopName, sharedNodes[from_id], + bit_set_count, offsets, deserial_pod, + bit_set_compute, buf_start); + } else if (data_mode == gidsData) { + SetSubset2D(loopName, offsets, bit_set_count, offsets, deserial_pod, + bit_set_compute); + } else { // bitsetData or offsetsData + SetSubset2D(loopName, sharedNodes[from_id], + bit_set_count, offsets, deserial_pod, + bit_set_compute); + } + } else { + // TODO(loc/hochan) + GALOIS_LOG_FATAL("Implement GPU"); + } + } + } + + Tset.stop(); + + return retval; + } + // TODO (loc) way too much code duplication template < SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, @@ -2820,6 +3186,39 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } #endif + template ::value>::type* = nullptr> + void syncNetRecv(std::string loopName) { + auto& net = galois::runtime::getSystemNetworkInterface(); + std::string wait_timer_str("Wait_" + get_run_identifier(loopName)); + galois::CondStatTimer Twait(wait_timer_str.c_str(), + RNAME); + + if (async) 
{ + GALOIS_LOG_FATAL("2d vector + async = unimplemented"); + } else { + for (unsigned x = 0; x < numHosts; ++x) { + if (x == id) + continue; + if (nothingToRecv(x, syncType, writeLocation, readLocation)) + continue; + + Twait.start(); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + Twait.stop(); + + SyncRecvApply2D( + p->first, p->second, loopName); + } + incrementEvilPhase(); + } + } + /** * Determines if there is anything to receive from a host and receives/applies * the messages. @@ -2834,7 +3233,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template + typename VecTy, bool async, + typename std::enable_if::value>::type* = nullptr> void syncNetRecv(std::string loopName) { auto& net = galois::runtime::getSystemNetworkInterface(); std::string wait_timer_str("Wait_" + get_run_identifier(loopName)); @@ -3061,6 +3461,28 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Higher Level Sync Calls (broadcast/reduce, etc) //////////////////////////////////////////////////////////////////////////////// + // 2D vector + template ::value>::type* = nullptr> + void reduce(std::string loopName, size_t elem_size) { + std::string timer_str("Reduce_" + get_run_identifier(loopName)); + galois::CondStatTimer TsyncReduce(timer_str.c_str(), + RNAME); + + using T = typename ReduceFnTy::ValTy::value_type; + using VecTy = galois::TwoDVector; + + TsyncReduce.start(); + + syncSend(loopName, elem_size); + syncRecv(loopName); + + TsyncReduce.stop(); + } + /** * Does a reduction of data from mirror nodes to master nodes. * @@ -3072,8 +3494,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void reduce(std::string loopName, size_t elem_size) { + typename ReduceFnTy, typename BitsetFnTy, bool async, + typename std::enable_if::value>::type* = nullptr> + void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), RNAME); @@ -3112,6 +3535,28 @@ class GluonSubstrate : public galois::runtime::GlobalObject { TsyncReduce.stop(); } + // 2d + template ::value>::type* = nullptr> + void broadcast(std::string loopname, size_t elem_size) { + std::string timer_str("Broadcast_" + get_run_identifier(loopname)); + galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), + RNAME); + + typedef typename BroadcastFnTy::ValTy::value_type T; + using VecTy = galois::TwoDVector; + + TsyncBroadcast.start(); + + syncSend(loopname, elem_size); + syncRecv(loopname); + + TsyncBroadcast.stop(); + } + /** * Does a broadcast of data from master to mirror nodes. 
* @@ -3123,8 +3568,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void broadcast(std::string loopName, size_t elem_size) { + typename BroadcastFnTy, typename BitsetFnTy, bool async, + typename std::enable_if::value>::type* = nullptr> + void broadcast(std::string loopName, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), RNAME); diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 04c696f6ab..0ba0ad2bd9 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -60,6 +60,10 @@ struct InitialDegreeSync { struct SubgraphDegreeSync { using ValTy = galois::gstl::Vector; + static size_t FeatVecSize() { + return gnn_sampled_out_degrees_->size();; + } + //! return a vector of floats to sync static ValTy extract(uint32_t lid, char&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); @@ -80,6 +84,13 @@ struct SubgraphDegreeSync { return true; } + static bool reduce(uint32_t lid, char&, ValTy::value_type* y) { + for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; + } + return true; + } + //! No-op: readAny = overwritten anyways; can probably get away with no-op static void reset(uint32_t lid, char&) { for (galois::LargeArray& layer_degrees : @@ -96,6 +107,13 @@ struct SubgraphDegreeSync { } } + static void setVal(uint32_t lid, char&, ValTy::value_type* y) { + for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; + } + } + + // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 89ccc83324..17063fe8ec 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -68,6 +68,10 @@ struct SampleFlagBitset { struct GNNSumAggregate { using ValTy = galois::gstl::Vector; + static size_t FeatVecSize() { + return gnn_matrix_to_sync_column_length_; + } + //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. @@ -96,6 +100,16 @@ struct GNNSumAggregate { return true; } + static bool reduce(uint32_t node_id, char&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX vectorized add + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] += + y[i]; + } + return true; + } + //! 
No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} // Reset is here in case anyone wants to bring it back @@ -116,6 +130,15 @@ struct GNNSumAggregate { } } + static void setVal(uint32_t node_id, char&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + y[i]; + } + } + + // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; @@ -135,6 +158,10 @@ struct GNNSumAggregate { struct GNNSampleSumAggregate { using ValTy = galois::gstl::Vector; + static size_t FeatVecSize() { + return gnn_matrix_to_sync_column_length_; + } + //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. @@ -175,6 +202,21 @@ struct GNNSampleSumAggregate { return true; } + static bool reduce(uint32_t node_id, char&, ValTy::value_type* y) { + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return false; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] += y[i]; + } + return true; + } + //! No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} @@ -193,6 +235,19 @@ struct GNNSampleSumAggregate { i] = y[i]; } } + static void setVal(uint32_t node_id, char&, ValTy::value_type* y) { + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] = y[i]; + } + } // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { From 8aa64e4d69bebf8f5e64e57f5d0f9608d59bfc5c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 4 Jun 2021 20:46:33 -0500 Subject: [PATCH 558/660] WIP part 2: cut off 0s --- libgalois/include/galois/TwoDVector.h | 1 + libgluon/include/galois/graphs/GluonSubstrate.h | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h index ab3a7ff882..240e7750a5 100644 --- a/libgalois/include/galois/TwoDVector.h +++ b/libgalois/include/galois/TwoDVector.h @@ -32,6 +32,7 @@ class TwoDVector { } const PODResizeableArray& data() { return underlying_memory_; } + void resize(size_t s) { underlying_memory_.resize(s); } private: size_t num_elements_{0}; size_t fixed_vector_size_{0}; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 6ccf35edec..37e6cca573 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -893,7 +893,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t bit_set_count, std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, - galois::runtime::SendBuffer& b) { + galois::runtime::SendBuffer& b, size_t feat_size) { std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + get_run_identifier(loopName)); @@ -908,16 +908,19 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } else if (data_mode == gidsData) { offsets.resize(bit_set_count); convertLIDToGID(loopName, indices, offsets); + two_d_vec.resize(bit_set_count * feat_size); Tserialize.start(); gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); Tserialize.stop(); } else if (data_mode == offsetsData) { offsets.resize(bit_set_count); + two_d_vec.resize(bit_set_count * feat_size); Tserialize.start(); gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); Tserialize.stop(); } else if (data_mode == bitsetData) { Tserialize.start(); + two_d_vec.resize(bit_set_count * feat_size); gSerialize(b, data_mode, bit_set_count, bit_set_comm, two_d_vec.data()); Tserialize.stop(); } else { // onlyData @@ -2358,7 +2361,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { SerializeMessage2D( loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, - two_d_array, b); + two_d_array, b, SyncFnTy::FeatVecSize()); } else { // TODO(loc/hochan) GPU //if (data_mode == noData) { From 2de8ab57de2548f1b384f0df58dc43510963e90b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 5 Jun 2021 15:20:18 -0500 Subject: [PATCH 559/660] WIP #3: write directly to 2d vector --- libgalois/include/galois/TwoDVector.h | 2 ++ .../include/galois/graphs/GluonSubstrate.h | 17 ++++++++-- .../galois/graphs/DegreeSyncStructures.h | 13 ++++++- .../graphs/GraphAggregationSyncStructures.h | 34 +++++++++++++++---- 4 files changed, 56 insertions(+), 10 deletions(-) diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h index 240e7750a5..1af9fba505 100644 --- a/libgalois/include/galois/TwoDVector.h +++ b/libgalois/include/galois/TwoDVector.h @@ -31,8 +31,10 @@ class TwoDVector { sizeof(T) * fixed_vector_size_); } + PODResizeableArray& edit_data() { return underlying_memory_; } const PODResizeableArray& data() { return underlying_memory_; } void resize(size_t s) { underlying_memory_.resize(s); } + size_t size() const { return underlying_memory_.size(); } private: size_t num_elements_{0}; size_t fixed_vector_size_{0}; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 37e6cca573..73282a0644 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -1199,6 +1199,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + + template + void ExtractWrapper2D(size_t lid, typename FnTy::ValTy::value_type* location_to_write) { + if (syncType == syncReduce) { + FnTy::ExtractDirect(lid, location_to_write); + char dummy = 0; + FnTy::reset(lid, dummy); + } else { + FnTy::ExtractDirect(lid, location_to_write); + } + } + /** * Extracts data at provided lid; uses vecIndex to get the correct element * from the vector. 
@@ -1306,7 +1318,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; size_t lid = indices[offset]; - two_d_vector.SetVector(n - start, extractWrapper(lid)); + + ExtractWrapper2D(lid, (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1321,7 +1334,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = offsets[n]; size_t lid = indices[offset]; - two_d_vector.SetVector(n - start, extractWrapper(lid)); + ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n - start) * FnTy::FeatVecSize()])); } } } diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 0ba0ad2bd9..44102a3807 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -1,4 +1,5 @@ #include "galois/GNNTypes.h" +//#include "galois/Logging.h" namespace galois { namespace graphs { @@ -64,7 +65,6 @@ struct SubgraphDegreeSync { return gnn_sampled_out_degrees_->size();; } - //! return a vector of floats to sync static ValTy extract(uint32_t lid, char&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); size_t count = 0; @@ -76,6 +76,17 @@ struct SubgraphDegreeSync { return vec_to_send; } + static void ExtractDirect(uint32_t lid, typename ValTy::value_type* to_write) { + size_t count = 0; + for (galois::LargeArray& layer_degrees : + *gnn_sampled_out_degrees_) { + std::memcpy(&to_write[count], + &layer_degrees[lid], + sizeof(typename ValTy::value_type)); + count++; + } + } + static bool reduce(uint32_t lid, char&, ValTy y) { assert(y.size() == gnn_sampled_out_degrees_->size()); for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 17063fe8ec..1270df5ff5 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -72,21 +72,30 @@ struct GNNSumAggregate { return gnn_matrix_to_sync_column_length_; } + //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. // assert(device_personality == DevicePersonality::CPU); - ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + ValTy extracted_vec; + extracted_vec.reserve(gnn_matrix_to_sync_column_length_); for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { // XXX memcpy - extracted_vec[i] = - gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]; + extracted_vec.emplace_back( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]); } // move constructor should kick in here to avoid return copy return extracted_vec; } + //! return a vector of floats to sync + static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { + std::memcpy(to_write, + (char*)&(gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + } + //! reduction is addition in this case; add received vector to //! 
own vector static bool reduce(uint32_t node_id, char&, ValTy y) { @@ -138,7 +147,6 @@ struct GNNSumAggregate { } } - // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; @@ -167,7 +175,9 @@ struct GNNSampleSumAggregate { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. // assert(device_personality == DevicePersonality::CPU); - ValTy extracted_vec(gnn_matrix_to_sync_column_length_, 0.0); + //ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + ValTy extracted_vec; + extracted_vec.reserve(gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { return extracted_vec; @@ -175,15 +185,25 @@ struct GNNSampleSumAggregate { for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { // XXX memcpy - extracted_vec[i] = + extracted_vec.emplace_back( gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + - i]; + i]); } // move constructor should kick in here to avoid return copy return extracted_vec; } + static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return; + } + std::memcpy(to_write, + (char*)&(gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id]* gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + } + //! reduction is addition in this case; add received vector to //! own vector static bool reduce(uint32_t node_id, char&, ValTy y) { From 3d33e96d3ad7dce8d86ada68cb466c07b5055639 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 7 Jun 2021 16:53:21 -0500 Subject: [PATCH 560/660] Directly serialize vectors to comm buffer --- libdist/include/galois/runtime/Serialize.h | 10 +- .../include/galois/graphs/GluonSubstrate.h | 321 +++++++++++++----- 2 files changed, 245 insertions(+), 86 deletions(-) diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 8110c954e9..bfd25c3cf3 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -135,6 +135,9 @@ class SerializeBuffer { //! (as determined by offset) const uint8_t* data() const { return bufdata.data() + kHeaderSize; } uint8_t* data() { return bufdata.data() + kHeaderSize; } + uint8_t* DataAtOffset(size_t offset) { + return bufdata.data() + kHeaderSize + offset; + } //! Returns the size of the serialize buffer size_type size() const { return bufdata.size() - kHeaderSize; } @@ -1052,10 +1055,9 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) - -> decltype(std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) -> decltype( + std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 73282a0644..c60308ac93 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -595,6 +595,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::PODResizeableArray& offsets, size_t& bit_set_count, DataCommMode& data_mode) const { + // i.e. 
not set by user if (substrateDataMode != onlyData) { bitset_comm.reset(); std::string syncTypeStr = @@ -619,10 +620,60 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::no_stats()); // get the number of set bits and the offsets into the comm bitset + // i.e., the things thaneed to be grabbed getOffsetsFromBitset(loopName, bitset_comm, offsets, bit_set_count); } + // from the count of things that need to be grabbed determine the data mode + // to use + data_mode = + get_data_mode(bit_set_count, indices.size()); + } + + template + void GetBitsetAndOffsets2D(const std::string& loopName, + const std::vector& indices, + const galois::DynamicBitSet& bitset_compute, + galois::DynamicBitSet& bitset_comm, + galois::PODResizeableArray& offsets, + size_t& bit_set_count, + DataCommMode& data_mode) const { + // i.e. not set by user + if (substrateDataMode != onlyData) { + bitset_comm.reset(); + std::string syncTypeStr = + (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string doall_str(syncTypeStr + "Bitset_" + loopName); + + bitset_comm.reset(); + // determine which local nodes in the indices array need to be + // sychronized + galois::do_all( + galois::iterate(size_t{0}, indices.size()), + [&](size_t n) { + // assumes each lid is unique as test is not thread safe + size_t lid = indices[n]; + if (bitset_compute.test(lid)) { + bitset_comm.set(n); + } + }, +#if GALOIS_COMM_STATS + galois::loopname(get_run_identifier(doall_str).c_str()), +#endif + galois::no_stats()); + + // get the number of set bits and the offsets into the comm bitset + // i.e., the things thaneed to be grabbed + getOffsetsFromBitset(loopName, bitset_comm, offsets, + bit_set_count); + } + + // from the count of things that need to be grabbed determine the data mode + // to use + // NOTE: this function is imprecise as it doesn't get actual size of + // vectors but only the size of the wrapper itself, but doesn't matter + // for selection purposes data_mode = get_data_mode(bit_set_count, indices.size()); } @@ -654,6 +705,39 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + template + size_t GetMaxSendBufferSizeVecs(uint32_t numShared) { + if (substrateDataMode == gidsData) { + return sizeof(DataCommMode) + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(unsigned int)) + sizeof(size_t) + + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } else if (substrateDataMode == offsetsData) { + return sizeof(DataCommMode) + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(unsigned int)) + sizeof(size_t) + + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } else if (substrateDataMode == bitsetData) { + size_t bitset_alloc_size = ((numShared + 63) / 64) * sizeof(uint64_t); + return sizeof(DataCommMode) + sizeof(size_t) + + sizeof(size_t) // bitset size + + sizeof(size_t) // bitset vector size + + bitset_alloc_size + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } else { // onlyData or noData (auto) + size_t bitset_alloc_size = ((numShared + 63) / 64) * sizeof(uint64_t); + return sizeof(DataCommMode) + sizeof(size_t) + + sizeof(size_t) // bitset size + + sizeof(size_t) // bitset vector size + + bitset_alloc_size + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } + } + 
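// Illustrative sketch, not part of the original patch: the sizes computed in
// GetMaxSendBufferSizeVecs above assume the 2D sync path flattens one
// fixed-width feature vector per shared node, so node n's value j lives at
// offset n * SyncFnTy::FeatVecSize() + j. The TwoDVector helper introduced
// earlier in this patch series implements exactly that layout; a minimal
// usage example (num_nodes and the width of 16 are placeholders) could be:
//
//   galois::TwoDVector<float> features;
//   features.SetVecSize(16);             // fixed per-node feature width
//   features.Create(num_nodes);          // backing store of num_nodes * 16 floats
//   galois::gstl::Vector<float> row(16, 1.0f);
//   features.SetVector(0, row);          // memcpy of row into slot [0, 16)
//   const auto& flat = features.data();  // contiguous array handed to gSerialize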
//////////////////////////////////////////////////////////////////////////////// // Local to global ID conversion //////////////////////////////////////////////////////////////////////////////// @@ -731,10 +815,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Message prep functions (buffering, send buffer getting, etc.) //////////////////////////////////////////////////////////////////////////////// - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async, - typename std::enable_if::value>::type* = nullptr> + template ::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; @@ -762,10 +845,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param b OUTPUT: Buffer that will hold data to send * @param elem_size The inner-vector dimesnion of a vector of the vector */ - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async, - typename std::enable_if::value>::type* = nullptr> + template ::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; @@ -774,8 +856,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { syncExtract( loopName, x, sharedNodes[x], b, elem_size); } else { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); + syncExtract(loopName, x, sharedNodes[x], + b, elem_size); } std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -888,12 +970,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Calls data on the TwoDVector template - void - SerializeMessage2D(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, - galois::PODResizeableArray& offsets, - galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, - galois::runtime::SendBuffer& b, size_t feat_size) { + void SerializeMessage2D(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, + TwoDVecType& two_d_vec, + galois::runtime::SendBuffer& b, size_t feat_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + get_run_identifier(loopName)); @@ -930,6 +1012,45 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // Only serializes the prefix + template + void + SerializeMessagePrefix2D(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, + galois::runtime::SendBuffer& b) { + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "SerializeMessagePrefix_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tserialize( + serialize_timer_str.c_str(), RNAME); + if (data_mode == noData) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } else if (data_mode == gidsData) { + offsets.resize(bit_set_count); + convertLIDToGID(loopName, indices, offsets); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets); + Tserialize.stop(); + } else if (data_mode == offsetsData) { + offsets.resize(bit_set_count); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets); + Tserialize.stop(); + } else if (data_mode == bitsetData) { + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, bit_set_comm); + Tserialize.stop(); + } else if (data_mode == onlyData) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } + } + /** * Given the data mode, deserialize the rest of a message in a Receive Buffer. * @@ -1199,9 +1320,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } - template - void ExtractWrapper2D(size_t lid, typename FnTy::ValTy::value_type* location_to_write) { + void ExtractWrapper2D(size_t lid, + typename FnTy::ValTy::value_type* location_to_write) { if (syncType == syncReduce) { FnTy::ExtractDirect(lid, location_to_write); char dummy = 0; @@ -1300,10 +1421,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template - void ExtractSubset2D(const std::string& loopName, - const std::vector& indices, size_t size, - const galois::PODResizeableArray& offsets, - VecTy& two_d_vector, size_t start = 0) { + void ExtractSubsetLazy2D( + const std::string& loopName, const std::vector& indices, + size_t size, const galois::PODResizeableArray& offsets, + galois::runtime::SendBuffer& send_buffer, size_t base_offset) { + size_t start = 0; if (parallelize) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -1317,9 +1439,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = n; else offset = offsets[n]; - size_t lid = indices[offset]; + size_t lid = indices[offset]; - ExtractWrapper2D(lid, (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); + ExtractWrapper2D( + lid, + (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( + base_offset)[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)]))); + // ExtractWrapper2D(lid, + // (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1333,13 +1461,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; - size_t lid = indices[offset]; - ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n - start) * FnTy::FeatVecSize()])); + size_t lid = indices[offset]; + + ExtractWrapper2D( + lid, (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( + base_offset)[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)]))); + // ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n + // - start) * FnTy::FeatVecSize()])); } } } - /** * Based on provided arguments, extracts the data that we are interested * in sending into val_vec. 
Same as above, except it has the vecIndex @@ -1558,7 +1691,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // 2D template - void SetWrapper2D(size_t lid, typename FnTy::ValTy::value_type* pointer_to_data, + void SetWrapper2D(size_t lid, + typename FnTy::ValTy::value_type* pointer_to_data, galois::DynamicBitSet& bit_set_compute) { if (syncType == syncReduce) { if (FnTy::reduce(lid, userGraph.getData(lid), pointer_to_data)) { @@ -1695,8 +1829,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; auto lid = indices[offset]; - SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], - bit_set_compute); + SetWrapper2D( + lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + bit_set_compute); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1710,13 +1845,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; auto lid = indices[offset]; - SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], - bit_set_compute); + SetWrapper2D( + lid, &val_vec[(n - start) * FnTy::FeatVecSize()], bit_set_compute); } } } - /** * VECTOR BITSET VARIANT. * @@ -2303,16 +2437,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { 1); } - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async> + template void SyncExtract2D(std::string loopName, unsigned from_id, std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; - static VecTy two_d_array; - two_d_array.SetVecSize(SyncFnTy::FeatVecSize()); galois::PODResizeableArray& offsets = syncOffsets; ////////////////////////////////////////////////////////////////////////////// @@ -2337,7 +2468,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (num > 0) { size_t bit_set_count = 0; Textractalloc.start(); - b.reserve(getMaxSendBufferSize(num)); + b.reserve(GetMaxSendBufferSizeVecs(num)); Textractalloc.stop(); Textractbatch.start(); @@ -2354,30 +2485,53 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offsets.reserve(maxSharedSize); bit_set_comm.resize(num); offsets.resize(num); - two_d_array.Create(num); Textractalloc.stop(); const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); - getBitsetAndOffsets( + GetBitsetAndOffsets2D( loopName, indices, bit_set_compute, bit_set_comm, offsets, bit_set_count, data_mode); - if (data_mode == onlyData) { - bit_set_count = indices.size(); - ExtractSubset2D( - loopName, indices, bit_set_count, offsets, two_d_array); - } else if (data_mode != - noData) { // bitsetData or offsetsData or gidsData - ExtractSubset2D( - loopName, indices, bit_set_count, offsets, two_d_array); - } + // serialize the prefix for the buffer based on data type: the data + // itself gets serialized directly into the buffer later + SerializeMessagePrefix2D(loopName, data_mode, bit_set_count, + indices, offsets, bit_set_comm, b); - SerializeMessage2D( - loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, - two_d_array, b, SyncFnTy::FeatVecSize()); + if (data_mode != noData) { + size_t lazy_buffer_size = 0; + if (data_mode == gidsData) { + lazy_buffer_size = bit_set_count * SyncFnTy::FeatVecSize(); + } else if (data_mode == offsetsData) { + lazy_buffer_size = bit_set_count * SyncFnTy::FeatVecSize(); + } else if (data_mode == bitsetData) { + lazy_buffer_size = bit_set_count * SyncFnTy::FeatVecSize(); + } 
else if (data_mode == onlyData) { + lazy_buffer_size = num * SyncFnTy::FeatVecSize(); + } + + size_t base_offset = 0; + if (lazy_buffer_size > 0) { + auto lazy_buffer = gSerializeLazySeq( + b, lazy_buffer_size, + (galois::PODResizeableArray< + typename SyncFnTy::ValTy::value_type>*)nullptr); + base_offset = lazy_buffer.off; + } + + // serialize the actual data directly into the buffer with lazy + // serialization + if (data_mode == onlyData) { + bit_set_count = indices.size(); + ExtractSubsetLazy2D( + loopName, indices, bit_set_count, offsets, b, base_offset); + } else { // bitsetData or offsetsData or gidsData + ExtractSubsetLazy2D( + loopName, indices, bit_set_count, offsets, b, base_offset); + } + } } else { // TODO(loc/hochan) GPU - //if (data_mode == noData) { + // if (data_mode == noData) { // b.resize(0); // if (!async) { // gSerialize(b, data_mode); @@ -2424,7 +2578,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { 1); } - /** * Vector bitset variant. * @@ -2807,11 +2960,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } // TODO (loc) way too much code duplication - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async> - size_t SyncRecvApply2D(uint32_t from_id, - galois::runtime::RecvBuffer& buf, + template + size_t SyncRecvApply2D(uint32_t from_id, galois::runtime::RecvBuffer& buf, std::string loopName) { //////////////////////////////////////////////////////////////////////////// std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -2825,7 +2976,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { //////////////////////////////////////////////////////////////////////////// galois::DynamicBitSet& bit_set_comm = syncBitset; - //static VecTy two_d_vector; + // static VecTy two_d_vector; galois::PODResizeableArray& offsets = syncOffsets; auto& sharedNodes = (syncType == syncReduce) ? 
masterNodes : mirrorNodes; @@ -2850,7 +3001,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t bit_set_count = num; size_t buf_start = 0; - using DeserialPOD = galois::PODResizeableArray; + using DeserialPOD = + galois::PODResizeableArray; DeserialPOD deserial_pod; // deserialize the rest of the data in the buffer depending on the @@ -2872,24 +3024,25 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } if (data_mode == onlyData) { - SetSubset2D(loopName, sharedNodes[from_id], - bit_set_count, offsets, deserial_pod, - bit_set_compute); + SetSubset2D( + loopName, sharedNodes[from_id], bit_set_count, offsets, + deserial_pod, bit_set_compute); } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { - SetSubset2D(loopName, sharedNodes[from_id], - bit_set_count, offsets, deserial_pod, - bit_set_compute, buf_start); + SetSubset2D( + loopName, sharedNodes[from_id], bit_set_count, offsets, + deserial_pod, bit_set_compute, buf_start); } else if (data_mode == gidsData) { - SetSubset2D(loopName, offsets, bit_set_count, offsets, deserial_pod, - bit_set_compute); + SetSubset2D(loopName, offsets, bit_set_count, + offsets, deserial_pod, + bit_set_compute); } else { // bitsetData or offsetsData - SetSubset2D(loopName, sharedNodes[from_id], - bit_set_count, offsets, deserial_pod, - bit_set_compute); + SetSubset2D( + loopName, sharedNodes[from_id], bit_set_count, offsets, + deserial_pod, bit_set_compute); } } else { // TODO(loc/hochan) @@ -3480,13 +3633,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // 2D vector template ::value>::type* = nullptr> + typename std::enable_if< + IsVector::value>::type* = nullptr> void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), RNAME); - using T = typename ReduceFnTy::ValTy::value_type; + using T = typename ReduceFnTy::ValTy::value_type; using VecTy = galois::TwoDVector; TsyncReduce.start(); @@ -3511,7 +3665,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template ::value>::type* = nullptr> + typename std::enable_if< + !IsVector::value>::type* = nullptr> void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), @@ -3554,7 +3709,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // 2d template ::value>::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> void broadcast(std::string loopname, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopname)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), @@ -3585,7 +3741,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template ::value>::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> void broadcast(std::string loopName, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), From 5b442736958b2deb08f8bc6563f4b15876b9db12 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 7 Jun 2021 18:19:09 -0500 Subject: [PATCH 561/660] Direct deserialization of GNN sync --- .../include/galois/graphs/GluonSubstrate.h | 105 ++++++++++++------ 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index c60308ac93..ae50e0e10f 
100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -1114,6 +1114,44 @@ class GluonSubstrate : public galois::runtime::GlobalObject { Tdeserialize.stop(); } + template + void DeserializeMessagePrefix( + std::string loopName, DataCommMode data_mode, uint32_t num, + galois::runtime::RecvBuffer& buf, size_t& bit_set_count, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, size_t& buf_start, size_t& retval, + size_t& vec_size) { + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "DeserializeMessage_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tdeserialize( + serialize_timer_str.c_str(), RNAME); + Tdeserialize.start(); + + // get other metadata associated with message if mode isn't OnlyData + if (data_mode != onlyData) { + galois::runtime::gDeserialize(buf, bit_set_count); + + if (data_mode == gidsData) { + galois::runtime::gDeserialize(buf, offsets); + convertGIDToLID(loopName, offsets); + } else if (data_mode == offsetsData) { + galois::runtime::gDeserialize(buf, offsets); + } else if (data_mode == bitsetData) { + bit_set_comm.resize(num); + galois::runtime::gDeserialize(buf, bit_set_comm); + } else if (data_mode == dataSplit) { + galois::runtime::gDeserialize(buf, buf_start); + } else if (data_mode == dataSplitFirst) { + galois::runtime::gDeserialize(buf, retval); + } + } + // Grab data size but not data + galois::runtime::gDeserialize(buf, vec_size); + + Tdeserialize.stop(); + } + //////////////////////////////////////////////////////////////////////////////// // Other helper functions //////////////////////////////////////////////////////////////////////////////// @@ -1446,8 +1484,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( base_offset)[(n - start) * FnTy::FeatVecSize() * sizeof(typename FnTy::ValTy::value_type)]))); - // ExtractWrapper2D(lid, - // (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1467,8 +1503,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { lid, (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( base_offset)[(n - start) * FnTy::FeatVecSize() * sizeof(typename FnTy::ValTy::value_type)]))); - // ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n - // - start) * FnTy::FeatVecSize()])); } } } @@ -1807,18 +1841,16 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } // 2D; vecty is a PODResize - template + template void SetSubset2D(const std::string& loopName, const IndicesVecTy& indices, size_t size, const galois::PODResizeableArray& offsets, - VecTy& val_vec, galois::DynamicBitSet& bit_set_compute, - size_t start = 0) { + galois::runtime::RecvBuffer& buf, + galois::DynamicBitSet& bit_set_compute, size_t start = 0) { std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; std::string doall_str(syncTypeStr + "SetVal_" + get_run_identifier(loopName)); - if (parallelize) { galois::do_all( galois::iterate(start, start + size), @@ -1830,7 +1862,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = offsets[n]; auto lid = indices[offset]; SetWrapper2D( - lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + lid, + (typename FnTy::ValTy::value_type*)&( + buf.data()[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)]), bit_set_compute); }, #if GALOIS_COMM_STATS @@ -1846,7 +1881,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = offsets[n]; auto lid = indices[offset]; SetWrapper2D( - lid, &val_vec[(n - start) * FnTy::FeatVecSize()], bit_set_compute); + lid, + (typename FnTy::ValTy::value_type*)(&( + buf.data()[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)])), + bit_set_compute); } } } @@ -2976,7 +3015,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { //////////////////////////////////////////////////////////////////////////// galois::DynamicBitSet& bit_set_comm = syncBitset; - // static VecTy two_d_vector; galois::PODResizeableArray& offsets = syncOffsets; auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; @@ -3001,21 +3039,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t bit_set_count = num; size_t buf_start = 0; - using DeserialPOD = - galois::PODResizeableArray; - DeserialPOD deserial_pod; - - // deserialize the rest of the data in the buffer depending on the - // data mode; arguments passed in here are mostly output vars - deserializeMessage(loopName, data_mode, num, buf, - bit_set_count, offsets, bit_set_comm, - buf_start, retval, deserial_pod); + size_t vec_size = 0; + DeserializeMessagePrefix( + loopName, data_mode, num, buf, bit_set_count, offsets, + bit_set_comm, buf_start, retval, vec_size); bit_set_comm.reserve(maxSharedSize); offsets.reserve(maxSharedSize); galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); - if (data_mode == bitsetData) { size_t bit_set_count2; getOffsetsFromBitset(loopName, bit_set_comm, offsets, @@ -3023,26 +3055,27 @@ class GluonSubstrate : public galois::runtime::GlobalObject { assert(bit_set_count == bit_set_count2); } + // note for all these the deserialize buffer is extracted from + // directly rather than copying it over to another vector if (data_mode == onlyData) { SetSubset2D( - loopName, sharedNodes[from_id], bit_set_count, offsets, - deserial_pod, bit_set_compute); + async, true, true>(loopName, sharedNodes[from_id], + bit_set_count, offsets, buf, + bit_set_compute); } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { SetSubset2D( - loopName, sharedNodes[from_id], bit_set_count, offsets, - deserial_pod, bit_set_compute, buf_start); + async, true, true>(loopName, sharedNodes[from_id], + bit_set_count, offsets, buf, + bit_set_compute, buf_start); } else if (data_mode == gidsData) { - SetSubset2D(loopName, offsets, bit_set_count, - offsets, deserial_pod, - bit_set_compute); + SetSubset2D(loopName, offsets, bit_set_count, offsets, buf, + bit_set_compute); } else { // bitsetData or offsetsData SetSubset2D( - loopName, sharedNodes[from_id], bit_set_count, offsets, - deserial_pod, bit_set_compute); + async, false, true>(loopName, sharedNodes[from_id], + bit_set_count, offsets, buf, + bit_set_compute); } } else { // TODO(loc/hochan) From 3a80d2483d8151f0c84b658d6e43a361271d7131 Mon Sep 17 00:00:00 2001 From: Loc 
Hoang Date: Wed, 9 Jun 2021 13:59:46 -0500 Subject: [PATCH 562/660] MKL microbenchmark additions and changes - Version that uses galois wrapper around single thread MKL calls - removal of unused things from original test --- libgnn/CMakeLists.txt | 11 ++ libgnn/test/CMakeLists.txt | 33 +++++- libgnn/test/mkl_micro.cpp | 26 +++-- libgnn/test/single_mkl_micro.cpp | 168 +++++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+), 16 deletions(-) create mode 100644 libgnn/test/single_mkl_micro.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index c5d9ee6e7a..5bf32581d7 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -16,6 +16,7 @@ set(sources set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") +set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) @@ -28,6 +29,16 @@ target_include_directories(galois_gnn PUBLIC ${MKL_INCLUDE_DIRS} ) +add_library(galois_gnn_single STATIC ${sources}) +target_link_directories(galois_gnn_single PUBLIC ${MKL_LIBRARIES}) +target_link_libraries(galois_gnn_single galois_shmem) +target_link_libraries(galois_gnn_single ${SINGLE_INTEL_LIBS}) +target_link_libraries(galois_gnn_single galois_dist_async galois_cusp galois_gluon galois_support) +target_include_directories(galois_gnn_single PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${MKL_INCLUDE_DIRS} +) + set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) add_subdirectory(test) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index a1ea769105..98b1d01e3e 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -1,3 +1,5 @@ +find_package(OpenMP) + add_executable(mkl_micro mkl_micro.cpp) target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) target_include_directories(mkl_micro PUBLIC @@ -10,8 +12,8 @@ target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) target_include_directories(mkl_micro_omp PUBLIC ${MKL_INCLUDE_DIRS} ) -target_link_libraries(mkl_micro_omp ${INTEL_LIBS}) -target_link_libraries(mkl_micro_omp -fopenmp) +target_link_libraries(mkl_micro_omp PUBLIC ${INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(mkl_micro_omp PUBLIC USE_OMP=1) add_executable(mkl_micro_sgalois mkl_micro.cpp) target_link_libraries(mkl_micro_sgalois galois_gnn) @@ -25,6 +27,33 @@ add_executable(mkl_micro_delete_galois mkl_micro.cpp) target_link_libraries(mkl_micro_delete_galois galois_gnn) target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) +################################################################################ + +#add_executable(single_mkl_micro single_mkl_micro.cpp) +#target_link_directories(single_mkl_micro PUBLIC ${MKL_LIBRARIES}) +#target_include_directories(single_mkl_micro PUBLIC +# ${MKL_INCLUDE_DIRS} +#) +#target_link_libraries(single_mkl_micro ${SINGLE_INTEL_LIBS}) + +add_executable(single_mkl_micro_omp single_mkl_micro.cpp) +target_link_directories(single_mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) +target_include_directories(single_mkl_micro_omp PUBLIC + ${MKL_INCLUDE_DIRS} +) +target_link_libraries(single_mkl_micro_omp ${SINGLE_INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(single_mkl_micro_omp PUBLIC USE_OMP=1) + +add_executable(single_mkl_micro_sgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_sgalois galois_gnn_single) +target_compile_definitions(single_mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + 
+add_executable(single_mkl_micro_dgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_dgalois galois_gnn_single) +target_compile_definitions(single_mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +################################################################################ + add_executable(gstl_test gstl_test.cpp) target_link_libraries(gstl_test galois_shmem) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index ea9511df74..10867a8c63 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #ifdef USE_SHARED_GALOIS @@ -84,29 +85,18 @@ int main(int argc, char* argv[]) { size_t b_dim = 128; size_t c_dim = 16; -#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) - printf("Using Galois large arrays\n"); - // inputs - galois::LargeArray matrix_1; - matrix_1.create(a_dim * b_dim); - galois::LargeArray matrix_2; - matrix_2.create(a_dim * c_dim); - // output - galois::LargeArray matrix_3; - matrix_3.create(b_dim * c_dim); -#else // inputs std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output + //std::vector matrix_3(a_dim * c_dim); std::vector matrix_3(b_dim * c_dim); -#endif size_t kBigSize = 1000000000; std::vector very_big_matrix(kBigSize); // change reps here; maybe make it command line arg - for (size_t reps = 0; reps < 3; reps++) { + for (size_t reps = 0; reps < 5; reps++) { // reinit srand(0); for (size_t i = 0; i < matrix_1.size(); i++) { @@ -124,7 +114,7 @@ int main(int argc, char* argv[]) { // dummy OMP TBB loop #ifdef USE_OMP -#pragma omp parallel +#pragma omp parallel for for (size_t i = 0; i < very_big_matrix.size(); i++) { very_big_matrix[i] = i; } @@ -132,10 +122,18 @@ int main(int argc, char* argv[]) { printf("Rep %lu\n", reps); + auto start = std::chrono::high_resolution_clock::now(); // transpose because it's the same as the problematic call in GNN // TODO(loc) non transpose version + //CBlasSGEMM(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); + auto stop = std::chrono::high_resolution_clock::now(); + + auto duration = std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); + printf("Run duration is %lf ms\n", duration.count() / 1000.0); } return 0; diff --git a/libgnn/test/single_mkl_micro.cpp b/libgnn/test/single_mkl_micro.cpp new file mode 100644 index 0000000000..ecbf9da6fd --- /dev/null +++ b/libgnn/test/single_mkl_micro.cpp @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include + +#ifdef USE_SHARED_GALOIS +#include "galois/Galois.h" +#include "galois/LargeArray.h" +#endif +#ifdef USE_DIST_GALOIS +#include "galois/DistGalois.h" +#include "galois/LargeArray.h" +#endif + +#ifdef USE_OMP +#include "omp.h" +#endif + +// MKL wrapper +#ifdef USE_OMP +void CBlasSGEMMOMP(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? 
output_columns : input_columns; + + #pragma omp parallel for + for (int i = 0; i < omp_get_num_threads(); i++) { + unsigned chunk_size = input_rows / omp_get_num_threads(); + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (omp_get_num_threads() - 1 == i) { + my_end = input_rows; + } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, my_output, output_columns); + } +} +#endif + +#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) +void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? output_columns : input_columns; + + galois::on_each( + [&] (size_t i, size_t num_threads) { + unsigned chunk_size = input_rows / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_rows; + } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, my_output, output_columns); + } + ); +} +#endif + + +void CacheFlush(std::vector* matrix) { + for (size_t i = 0; i < matrix->size(); i++) { + (*matrix)[i] = i; + } +} + +int main(int argc, char* argv[]) { +#ifdef USE_SHARED_GALOIS + galois::SharedMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); +#endif + +#ifdef USE_DIST_GALOIS + galois::DistMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Dist Mem with %u threads\n", + galois::getActiveThreads()); +#endif + + printf("%d %s\n", argc, argv[0]); + + // dimensions from test case + size_t a_dim = 12000000; + size_t b_dim = 128; + size_t c_dim = 16; + + // inputs + std::vector matrix_1(a_dim * b_dim); + std::vector matrix_2(a_dim * c_dim); + // output + std::vector matrix_3(a_dim * c_dim); + + size_t kBigSize = 1000000000; + std::vector very_big_matrix(kBigSize); + + // change reps here; maybe make it command line arg + for (size_t reps = 0; reps < 5; reps++) { + // reinit + srand(0); + for (size_t i = 0; i < matrix_1.size(); i++) { + matrix_1[i] = rand() / static_cast(RAND_MAX / 10); + } + srand(1); + for (size_t i = 0; i < matrix_2.size(); i++) { + matrix_2[i] = rand() / static_cast(RAND_MAX / 10); + } + + very_big_matrix.clear(); + very_big_matrix.resize(kBigSize); + // cache flush + CacheFlush(&very_big_matrix); + + printf("Rep %lu\n", reps); + + auto start = std::chrono::high_resolution_clock::now(); + // transpose because it's the same as the problematic call in GNN + // TODO(loc) non transpose version +#ifdef USE_OMP + 
CBlasSGEMMOMP(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + matrix_2.data(), matrix_3.data()); +#endif +#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) + CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + matrix_2.data(), matrix_3.data()); +#endif + //CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); + auto stop = std::chrono::high_resolution_clock::now(); + + auto duration = std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); + printf("Run duration is %lf ms\n", duration.count() / 1000.0); + } + + return 0; +} From 7ef6e893b0ea6ba6872ff09e8aa2c9198485e95b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 9 Jun 2021 19:12:07 -0500 Subject: [PATCH 563/660] mkl micro: transpose for single thread mkl --- libgnn/test/mkl_micro.cpp | 3 ++ libgnn/test/single_mkl_micro.cpp | 74 ++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index 10867a8c63..73b3a08893 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -82,6 +82,7 @@ int main(int argc, char* argv[]) { // dimensions from test case size_t a_dim = 12000000; + //size_t a_dim = 120000; size_t b_dim = 128; size_t c_dim = 16; @@ -129,6 +130,8 @@ int main(int argc, char* argv[]) { // matrix_2.data(), matrix_3.data()); CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); + //CBlasSGEMM(CblasNoTrans, CblasTrans, b_dim, a_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::time_point_cast(stop) - diff --git a/libgnn/test/single_mkl_micro.cpp b/libgnn/test/single_mkl_micro.cpp index ecbf9da6fd..7111b1b057 100644 --- a/libgnn/test/single_mkl_micro.cpp +++ b/libgnn/test/single_mkl_micro.cpp @@ -6,11 +6,11 @@ #ifdef USE_SHARED_GALOIS #include "galois/Galois.h" -#include "galois/LargeArray.h" +#include "galois/PODResizeableArray.h" #endif #ifdef USE_DIST_GALOIS #include "galois/DistGalois.h" -#include "galois/LargeArray.h" +#include "galois/PODResizeableArray.h" #endif #ifdef USE_OMP @@ -57,25 +57,60 @@ void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans size_t lead_dim_b = (trans_b == CblasNoTrans) ? output_columns : input_columns; + static std::vector> temps; + if (trans_a == CblasTrans) { + temps.resize(galois::getActiveThreads()); + } + galois::on_each( [&] (size_t i, size_t num_threads) { - unsigned chunk_size = input_rows / num_threads; - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); - if (num_threads - 1 == i) { - my_end = input_rows; + if (trans_a != CblasTrans) { + unsigned chunk_size = input_rows / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_rows; + } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 
1.0 : 0.0, my_output, output_columns); + } else { + galois::PODResizeableArray& my_pod = temps[i]; + my_pod.resize(input_rows * output_columns); + + unsigned chunk_size = input_columns / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_columns; + } + unsigned b_rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_rows); + const float* my_b = b + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + b_rows_to_use, 1.0, my_a, lead_dim_a, my_b, lead_dim_b, + false ? 1.0 : 0.0, my_pod.data(), output_columns); + } } - unsigned rows_to_use = my_end - my_start; - - const float* my_a = a + (my_start * input_columns); - float* my_output = output + (my_start * output_columns); + ); - // do the MM - cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, - input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, - false ? 1.0 : 0.0, my_output, output_columns); + if (trans_a == CblasTrans) { + printf("Manual summation\n"); + for (galois::PODResizeableArray& temp_out : temps) { + for (unsigned i = 0; i < temp_out.size(); i++) { + output[i] += temp_out[i]; + } } - ); + } } #endif @@ -120,7 +155,8 @@ int main(int argc, char* argv[]) { std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output - std::vector matrix_3(a_dim * c_dim); + //std::vector matrix_3(a_dim * c_dim); + std::vector matrix_3(b_dim * c_dim); size_t kBigSize = 1000000000; std::vector very_big_matrix(kBigSize); @@ -152,7 +188,9 @@ int main(int argc, char* argv[]) { matrix_2.data(), matrix_3.data()); #endif #if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) - CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + //CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); + CBlasSGEMMGalois(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); #endif //CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), From fbdf83383ca079155234e80f329018bb7adff8a5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 15 Jun 2021 19:50:39 -0500 Subject: [PATCH 564/660] SampledAny opt, fix end-of-execution free overhead 1) SampledAny was very expensive apparently (looping through boolean vec for every edge); sensible because probably locality issues. Avoids this by using a bitset to mark if an edge has ever been sampled. Improves perf significantly from what I can tell, but needs more testing. 2) End of execution free LargeArray of std::vectors was insanely expensive. This fixes it by using gstl::Vector instead. --- libgnn/include/galois/graphs/GNNGraph.h | 19 +++++++------------ libgnn/src/graphs/GNNGraph.cpp | 2 ++ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 775fd2af3a..723249fe2f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -175,11 +175,7 @@ class GNNGraph { } bool IsEdgeSampledAny(EdgeIterator ei) const { - for (bool b : edge_sample_status_[*ei]) { - if (b) - return true; - } - return false; + return sampled_edges_.test(*ei); } bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { if (!use_subgraph_) { @@ -205,6 +201,7 @@ class GNNGraph { //! 
Set the flag on the edge to 1; makes it sampled void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { + sampled_edges_.set(*ei); edge_sample_status_[*ei][layer_num] = 1; }; //! Set the flag on the edge to 0; makes it not sampled @@ -224,6 +221,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// + EdgeIterator in_edge_begin(GraphNode n) const { if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edge_begin(n); @@ -271,12 +269,7 @@ class GNNGraph { } bool IsInEdgeSampledAny(EdgeIterator ei) const { - for (bool b : - edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)]) { - if (b) - return true; - } - return false; + return sampled_edges_.test(partitioned_graph_->InEdgeToOutEdge(ei)); }; bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { @@ -724,7 +717,7 @@ class GNNGraph { std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer - galois::LargeArray> edge_sample_status_; + galois::LargeArray> edge_sample_status_; // TODO use a char maybe? unlikely anyone will go over 2^8 layers... //! What timestep a node was added to sampled set; used to determine //! size of subgraph at each layer @@ -732,6 +725,8 @@ class GNNGraph { //! Indicates newly sampled nodes (for distributed synchronization of sampling //! status galois::DynamicBitSet new_sampled_nodes_; + //! If edge is sampled at any point, mark this + galois::DynamicBitSet sampled_edges_; ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b7cb9596e0..c1afe6e6c4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -884,6 +884,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, sample_node_timestamps_.create(partitioned_graph_->size(), std::numeric_limits::max()); edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); + sampled_edges_.resize(partitioned_graph_->sizeEdges()); // this is to hold the degree of a sampled graph considering all hosts; yes, // memory wise this is slightly problematic possibly, but each layer is its // own subgraph @@ -929,6 +930,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { std::fill(edge_sample_status_[edge_id].begin(), edge_sample_status_[edge_id].end(), 0); }); + sampled_edges_.reset(); // reset all degrees if (!subgraph_choose_all_) { galois::do_all( From 9146c0c47630ebe92f997f91d5e021d6a1a51050 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 15 Jun 2021 21:40:50 -0500 Subject: [PATCH 565/660] DynamicBitsets for each layer's edge samples Rather than a vector for each edge, have a bitset for each layer for edge sampling marking; more locality when checking edges of one particular layer rather than jumping around many vectors. 
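[Note, not part of the patch] To make the locality argument in this commit message concrete, here is a minimal standalone sketch, not the library code: std::vector<uint8_t> stands in for galois::DynamicBitSet, the sizes are illustrative, and the struct and method names are made up. It contrasts the edge-major layout being replaced with the layer-major layout this patch adopts.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Edge-major (old): one small vector per edge; checking one layer across
    // many edges touches a separate heap allocation per edge.
    struct EdgeMajorSampleStatus {
      std::vector<std::vector<uint8_t>> per_edge; // [edge][layer]
      void init(size_t num_edges, size_t num_layers) {
        per_edge.assign(num_edges, std::vector<uint8_t>(num_layers, 0));
      }
      bool test(size_t edge, size_t layer) const { return per_edge[edge][layer]; }
      void set(size_t edge, size_t layer) { per_edge[edge][layer] = 1; }
    };

    // Layer-major (new): one flat bitset per layer; scanning all edges of a
    // single layer is a contiguous sweep, which is the common access pattern
    // when one aggregation layer checks its own edge flags.
    struct LayerMajorSampleStatus {
      std::vector<std::vector<uint8_t>> per_layer; // [layer][edge]
      void init(size_t num_edges, size_t num_layers) {
        per_layer.assign(num_layers, std::vector<uint8_t>(num_edges, 0));
      }
      bool test(size_t edge, size_t layer) const { return per_layer[layer][edge]; }
      void set(size_t edge, size_t layer) { per_layer[layer][edge] = 1; }
    };

    int main() {
      LayerMajorSampleStatus status;
      status.init(/*num_edges=*/1000, /*num_layers=*/3);
      status.set(42, 1);
      return status.test(42, 1) ? 0 : 1;
    }

With the layer-major form, clearing or checking one layer's flags for every edge is a single contiguous pass instead of one pointer chase per edge, which is what the diff below switches edge_sample_status_ to.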
--- libgnn/include/galois/graphs/GNNGraph.h | 22 ++++++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 14 ++++++++------ 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 723249fe2f..e50d0197d4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -180,7 +180,7 @@ class GNNGraph { bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { if (!use_subgraph_) { // view uses original graph edge iterators - return edge_sample_status_[ei][layer_num]; + return edge_sample_status_[layer_num].test(ei); } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); return false; @@ -189,24 +189,24 @@ class GNNGraph { bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { // view uses original graph edge iterators - return edge_sample_status_[*ei][layer_num]; + return edge_sample_status_[layer_num].test(*ei); } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); } }; //! Always use original graph's edge iterator here bool IsEdgeSampledOriginalGraph(EdgeIterator ei, size_t layer_num) const { - return edge_sample_status_[*ei][layer_num]; + return edge_sample_status_[layer_num].test(*ei); }; //! Set the flag on the edge to 1; makes it sampled void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { sampled_edges_.set(*ei); - edge_sample_status_[*ei][layer_num] = 1; + edge_sample_status_[layer_num].set(*ei); }; //! Set the flag on the edge to 0; makes it not sampled void MakeEdgeUnsampled(EdgeIterator ei, size_t layer_num) { - edge_sample_status_[*ei][layer_num] = 0; + edge_sample_status_[layer_num].reset(*ei, *ei); }; // GNNEdgeSortIterator EdgeSortBegin(GraphNode n) { @@ -274,8 +274,8 @@ class GNNGraph { bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { // view can use this fine + requires it - return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] - [layer_num]; + return edge_sample_status_[layer_num].test( + partitioned_graph_->InEdgeToOutEdge(ei)); } else { return subgraph_->InEdgeSampled(ei, layer_num, *this); } @@ -283,11 +283,13 @@ class GNNGraph { //! Set the flag on the edge to 1; makes it sampled void MakeInEdgeSampled(EdgeIterator ei, size_t layer_num) { - edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 1; + edge_sample_status_[layer_num].set(partitioned_graph_->InEdgeToOutEdge(ei)); }; //! Set the flag on the edge to 0; makes it not sampled void MakeInEdgeUnsampled(EdgeIterator ei, size_t layer_num) { - edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 0; + edge_sample_status_[layer_num].reset( + partitioned_graph_->InEdgeToOutEdge(ei), + partitioned_graph_->InEdgeToOutEdge(ei)); }; ////////////////////////////////////////////////////////////////////////////// @@ -717,7 +719,7 @@ class GNNGraph { std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer - galois::LargeArray> edge_sample_status_; + std::vector edge_sample_status_; // TODO use a char maybe? unlikely anyone will go over 2^8 layers... //! What timestep a node was added to sampled set; used to determine //! 
size of subgraph at each layer diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c1afe6e6c4..b10ea8d26e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -883,7 +883,10 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, subgraph_ = std::make_unique(partitioned_graph_->size()); sample_node_timestamps_.create(partitioned_graph_->size(), std::numeric_limits::max()); - edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); + edge_sample_status_.resize(num_layers); + for (size_t i = 0; i < num_layers; i++) { + edge_sample_status_[i].resize(partitioned_graph_->sizeEdges()); + } sampled_edges_.resize(partitioned_graph_->sizeEdges()); // this is to hold the degree of a sampled graph considering all hosts; yes, // memory wise this is slightly problematic possibly, but each layer is its @@ -925,11 +928,10 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), std::numeric_limits::max()); // clear all sampled edges - galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->sizeEdges()), - [&](size_t edge_id) { - std::fill(edge_sample_status_[edge_id].begin(), - edge_sample_status_[edge_id].end(), 0); - }); + galois::do_all( + galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), + [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }); + sampled_edges_.reset(); // reset all degrees if (!subgraph_choose_all_) { From 23d449b2ed1f10bcbe6a122572fcec05dccf7aa9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 17 Jun 2021 14:26:11 -0500 Subject: [PATCH 566/660] GNN: Optimize SID map, fix subgraph master masking 1) SID mapping made many passes over the graph; this is inefficient. Optimize by counting how many nodes will make first appearance in each layer and make only a single pass over the graph to get SID mappings. Further improvement could be obtained by making this run in parallel (doable with another prefix sum on top of existing one for work assigned to each thread possibly), but will go there as necessary. 2) Master masking in SAGE/GCN layers for correctness was buggy in subgraph case as it is possible that masters would appear beyond the layer 0 prefix used by subgraphs. Avoid this by using a bitset which indicates masters that are not in this prefix so that they are not masked out accidentally. This fix seemingly improves time to accuracy as well as time in general because less 0s are being written. 
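[Note, not part of the patch] Item 1) above is essentially a counting sort keyed by the layer in which a node first appears: count per-layer first appearances, prefix-sum the counts into per-layer starting offsets, then hand out subgraph IDs (SIDs) from each layer's bucket in one sweep. The sketch below shows only that core idea with plain std containers and made-up data; it omits the master/mirror split and the non-layer-zero-master bitset that the real CreateSubgraphMapping in this patch also handles.

    #include <cstdint>
    #include <limits>
    #include <vector>

    // Assign contiguous subgraph IDs grouped by the layer in which a node
    // first appears, using a single pass over the node timestamps.
    std::vector<uint32_t>
    AssignSubgraphIds(const std::vector<uint32_t>& timestamps, size_t num_layers) {
      const uint32_t kUnsampled = std::numeric_limits<uint32_t>::max();
      // 1) count nodes whose first appearance is in each layer
      std::vector<uint32_t> layer_counts(num_layers, 0);
      for (uint32_t t : timestamps)
        if (t != kUnsampled) ++layer_counts[t];
      // 2) exclusive prefix sum -> next free SID for each layer's bucket
      std::vector<uint32_t> next_sid(num_layers, 0);
      uint32_t running = 0;
      for (size_t l = 0; l < num_layers; ++l) {
        next_sid[l] = running;
        running += layer_counts[l];
      }
      // 3) single pass assigning SIDs from the appropriate bucket
      std::vector<uint32_t> lid_to_sid(timestamps.size(), kUnsampled);
      for (size_t lid = 0; lid < timestamps.size(); ++lid)
        if (timestamps[lid] != kUnsampled)
          lid_to_sid[lid] = next_sid[timestamps[lid]]++;
      return lid_to_sid;
    }

    int main() {
      // timestamp = layer of first appearance; max() means never sampled
      std::vector<uint32_t> ts = {0, 2, 1, 0,
                                  std::numeric_limits<uint32_t>::max(), 1};
      auto sids = AssignSubgraphIds(ts, /*num_layers=*/3);
      // layer-0 nodes get SIDs 0..1, layer-1 nodes 2..3, the layer-2 node gets 4
      return (sids[0] == 0 && sids[3] == 1 && sids[2] == 2) ? 0 : 1;
    }

The old mapping made one pass over the graph per sampled layer; the prefix-sum form visits each node once regardless of how many layers were sampled, which is where the speedup claimed in the commit message comes from.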
--- .../galois/graphs/DegreeSyncStructures.h | 15 +-- libgnn/include/galois/graphs/GNNGraph.h | 22 ++++- libgnn/include/galois/graphs/GNNSubgraph.h | 4 +- .../graphs/GraphAggregationSyncStructures.h | 33 ++++--- libgnn/include/galois/layers/GNNLayer.h | 5 + libgnn/src/graphs/GNNGraph.cpp | 49 ++++++++++ libgnn/src/graphs/GNNSubgraph.cpp | 97 +++++++++++-------- libgnn/src/layers/GNNLayer.cpp | 88 +++++++++++++++++ libgnn/src/layers/SAGELayer.cpp | 37 +++++-- 9 files changed, 275 insertions(+), 75 deletions(-) diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 44102a3807..91a94d64ac 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -62,7 +62,7 @@ struct SubgraphDegreeSync { using ValTy = galois::gstl::Vector; static size_t FeatVecSize() { - return gnn_sampled_out_degrees_->size();; + return gnn_sampled_out_degrees_->size(); } static ValTy extract(uint32_t lid, char&) { @@ -76,12 +76,12 @@ struct SubgraphDegreeSync { return vec_to_send; } - static void ExtractDirect(uint32_t lid, typename ValTy::value_type* to_write) { + static void ExtractDirect(uint32_t lid, + typename ValTy::value_type* to_write) { size_t count = 0; for (galois::LargeArray& layer_degrees : *gnn_sampled_out_degrees_) { - std::memcpy(&to_write[count], - &layer_degrees[lid], + std::memcpy(&to_write[count], &layer_degrees[lid], sizeof(typename ValTy::value_type)); count++; } @@ -96,7 +96,8 @@ struct SubgraphDegreeSync { } static bool reduce(uint32_t lid, char&, ValTy::value_type* y) { - for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + for (size_t degree_index = 0; + degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; } return true; @@ -119,12 +120,12 @@ struct SubgraphDegreeSync { } static void setVal(uint32_t lid, char&, ValTy::value_type* y) { - for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + for (size_t degree_index = 0; + degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; } } - // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index e50d0197d4..9c2e6061bf 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -634,6 +634,16 @@ class GNNGraph { void DisableSubgraphChooseAll() { subgraph_choose_all_ = false; } void SetSubgraphChooseAll(bool a) { subgraph_choose_all_ = a; } + std::vector& GetMasterOffsets() { return sample_master_offsets_; } + std::vector& GetMirrorOffsets() { return sample_mirror_offsets_; } + + galois::DynamicBitSet& GetNonLayerZeroMasters() { + return non_layer_zero_masters_; + } + const galois::DynamicBitSet& GetNonLayerZeroMasters() const { + return non_layer_zero_masters_; + } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -724,6 +734,17 @@ class GNNGraph { //! What timestep a node was added to sampled set; used to determine //! size of subgraph at each layer galois::LargeArray sample_node_timestamps_; + //! Count of how many masters are in each layer in a sampled subgraph. + std::vector sample_master_offsets_; + //! 
Count of how many mirrors are in each layer in a sampled subgraph. + std::vector sample_mirror_offsets_; + //! In a subgraph, all layer 0 masters are made the prefix of SIDs; other + //! masters that are not layer 0 will be scattered elsewhere. This bitset + //! tracks which of those SIDs are the masters. + //! This is required for master masking in certain layers in distributed + //! execution to avoid recomputation of certain gradients. + galois::DynamicBitSet non_layer_zero_masters_; + //! Indicates newly sampled nodes (for distributed synchronization of sampling //! status galois::DynamicBitSet new_sampled_nodes_; @@ -768,7 +789,6 @@ class GNNGraph { std::vector global_degrees_; std::vector global_train_degrees_; - // TODO vars for subgraphs as necessary bool use_subgraph_{false}; bool use_subgraph_view_{false}; bool subgraph_choose_all_{false}; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 81825e2ed1..ddd4c8d277 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -111,10 +111,10 @@ class GNNSubgraph { t->stop(); } + // TODO signature cleanup //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. - void CreateSubgraphMapping(const GNNGraph& gnn_graph, - size_t num_sampled_layers); + void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t); //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 1270df5ff5..7501a7c23d 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -68,10 +68,7 @@ struct SampleFlagBitset { struct GNNSumAggregate { using ValTy = galois::gstl::Vector; - static size_t FeatVecSize() { - return gnn_matrix_to_sync_column_length_; - } - + static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { @@ -90,10 +87,13 @@ struct GNNSumAggregate { } //! return a vector of floats to sync - static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { - std::memcpy(to_write, - (char*)&(gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_]), - gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + static void ExtractDirect(uint32_t node_id, + typename ValTy::value_type* to_write) { + std::memcpy( + to_write, + (char*)&( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); } //! reduction is addition in this case; add received vector to @@ -166,16 +166,14 @@ struct GNNSumAggregate { struct GNNSampleSumAggregate { using ValTy = galois::gstl::Vector; - static size_t FeatVecSize() { - return gnn_matrix_to_sync_column_length_; - } + static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. 
// assert(device_personality == DevicePersonality::CPU); - //ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + // ValTy extracted_vec(gnn_matrix_to_sync_column_length_); ValTy extracted_vec; extracted_vec.reserve(gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == @@ -194,14 +192,17 @@ struct GNNSampleSumAggregate { return extracted_vec; } - static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { + static void ExtractDirect(uint32_t node_id, + typename ValTy::value_type* to_write) { if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { return; } - std::memcpy(to_write, - (char*)&(gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id]* gnn_matrix_to_sync_column_length_]), - gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + std::memcpy( + to_write, + (char*)&(gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); } //! reduction is addition in this case; add received vector to diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e61d398a64..786a973230 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -364,12 +364,17 @@ class GNNLayer { MaskInputNonMasters(input, std::numeric_limits::max()); } void MaskInputNonMasters(PointerWithSize* input, size_t max_rows); + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows, + const galois::DynamicBitSet&); + //! Mask a gradient size'd matrix's rows that correspond to mirrors void MaskGradientNonMasters(PointerWithSize* input) { MaskGradientNonMasters(input, std::numeric_limits::max()); } void MaskGradientNonMasters(PointerWithSize* gradients, size_t max_rows); + void MaskGradientNonMasters(PointerWithSize* gradients, + size_t max_rows, const galois::DynamicBitSet&); //! 
Does some math to get GB used by some # of floats double FloatElementsToGB(size_t num_of_floats) const { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b10ea8d26e..fdd2d6e1dc 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -899,6 +899,9 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } else { subgraph_choose_all_ = true; } + + sample_master_offsets_.resize(num_layers + 1, 0); + sample_mirror_offsets_.resize(num_layers + 1, 0); } size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { @@ -927,6 +930,9 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // clear node timestamps std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), std::numeric_limits::max()); + std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); + std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + // clear all sampled edges galois::do_all( galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), @@ -958,15 +964,29 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } galois::GAccumulator local_seed_count; local_seed_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } + local_seed_count += 1; // 0 = seed node sample_node_timestamps_[*x] = 0; } }); + sample_master_offsets_[0] = master_offset.reduce(); + sample_mirror_offsets_[0] = mirror_offset.reduce(); + return local_seed_count.reduce(); } @@ -1036,15 +1056,29 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, galois::GAccumulator local_sample_count; local_sample_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } sample_node_timestamps_[*x] = timestamp; } } }); + assert(sample_master_offsets_.size() > timestamp); + assert(sample_mirror_offsets_.size() > timestamp); + sample_master_offsets_[timestamp] = master_offset.reduce(); + sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); EnableSubgraphChooseAll(); return local_sample_count.reduce(); @@ -1131,15 +1165,30 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // count sampled node size galois::GAccumulator local_sample_count; local_sample_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { + local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } sample_node_timestamps_[*x] = timestamp; } } }); + assert(sample_master_offsets_.size() > timestamp); + 
assert(sample_mirror_offsets_.size() > timestamp); + sample_master_offsets_[timestamp] = master_offset.reduce(); + sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); DisableSubgraphChooseAll(); return local_sample_count.reduce(); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 2493319904..360586b7df 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -28,8 +28,9 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( return num_subgraph_nodes_; } +// TODO signature cleanup void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( - const GNNGraph& gnn_graph, size_t num_sampled_layers) { + GNNGraph& gnn_graph, size_t) { galois::StatTimer timer("SIDMapping", kRegionName); TimerStart(&timer); @@ -51,60 +52,73 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); } - // TODO(loc) depending on overhead, can parallelize this with a prefix sum - // serial loop over LIDs to construct lid -> subgraph id mapping - uint32_t current_sid = 0; + galois::DynamicBitSet& non_layer_zero_masters = + gnn_graph.GetNonLayerZeroMasters(); + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - // split into 2 parts: masters, then mirrors + // init the bitset as necessary + if (non_layer_zero_masters.size() < num_subgraph_nodes_) { + non_layer_zero_masters.resize(num_subgraph_nodes_); + } else { + non_layer_zero_masters.reset(); + } + + // compute offsets for each layer + uint32_t layer_zero_offset = 0; + galois::PODResizeableArray layer_offsets; + layer_offsets.resize(master_offsets.size() - 1); + for (unsigned i = 0; i < layer_offsets.size(); i++) { + layer_offsets[i] = master_offsets[i] + mirror_offsets[i]; + if (i > 0) { + // prefix summing + layer_offsets[i] += layer_offsets[i - 1]; + } + } + + // split into 2 parts: masters, then everything else size_t last_owned_node = *(gnn_graph.end_owned()); + galois::gInfo(last_owned_node); for (size_t local_node_id = 0; local_node_id < last_owned_node; local_node_id++) { - if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { - // TODO should bound check the SID to max uint32_t - // note: if SID is max uint32t, then it's not valid - subgraph_id_to_lid_[current_sid] = local_node_id; - lid_to_subgraph_id_[local_node_id] = current_sid++; + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + // master that won't be in prefix needs to be marked + non_layer_zero_masters.set(sid_to_use); + } else { + sid_to_use = layer_zero_offset++; + } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } - // all nodes before this SID are master nodes *that matter* - // NOTE: there is a very subtle distinction here implementation wise - // that needs to be resolved in slightly more detail than this; - // there may be master nodes that are past this boundary that will - // not be covered by this begin_owned loop, which may cause problems down + // all nodes before this SID are master nodes in layer 0; + // NOTE: there are master nodes past this boundary that will + // not be covered by a begin_owned loop, which may cause problems down // the line - // TODO(loc) see above - subgraph_master_boundary_ = 
current_sid; + subgraph_master_boundary_ = master_offsets[0]; + // everything else; none of these are master nodes for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); local_node_id++) { - if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { - // TODO should bound check the SID to max uint32_t - // note: if SID is max uint32t, then it's not valid - subgraph_id_to_lid_[current_sid] = local_node_id; - lid_to_subgraph_id_[local_node_id] = current_sid++; - } - } - galois::gDebug( - "Number of sampled nodes for subgraph construction layer 0 is ", - current_sid); - - // XXX each sampled layer can be queried in parallel (think prefix sum); do - // this if this becomes a bottleneck - for (size_t i = 1; i < num_sampled_layers + 1; i++) { - for (size_t local_node_id = 0; local_node_id < gnn_graph.size(); - local_node_id++) { - if (gnn_graph.SampleNodeTimestamp(local_node_id) == i) { - subgraph_id_to_lid_[current_sid] = local_node_id; - lid_to_subgraph_id_[local_node_id] = current_sid++; + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + } else { + sid_to_use = layer_zero_offset++; } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } - galois::gDebug("Number of sampled nodes for subgraph construction, layer ", - i, " is ", current_sid); } - GALOIS_LOG_ASSERT(num_subgraph_nodes_ == current_sid); - // num_subgraph_nodes_ = current_sid; + GALOIS_LOG_ASSERT(layer_offsets.back() == num_subgraph_nodes_); TimerStop(&timer); } @@ -141,8 +155,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( } } local_subgraph_in_degrees_[subgraph_id] = in_degrees; - // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", - // out_degrees, " in ", in_degrees); }, galois::loopname("DegreeCountingDoAll"), galois::steal()); @@ -231,7 +243,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( galois::StatTimer timer("NodeFeatureCreation", kRegionName); TimerStart(&timer); size_t feat_length = gnn_graph.node_feature_length(); - // assumes everything is already setup subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); galois::do_all(galois::iterate(begin(), end()), [&](size_t subgraph_node_id) { diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 4c828dbb19..e4f14d7408 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -423,6 +423,45 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, #endif } +void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, + size_t max_rows, + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + if (!bs.test(non_master)) { + // TODO(loc) use a std 
function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient, size_t max_rows) { assert(*(graph_.begin_owned()) == 0); @@ -465,3 +504,52 @@ void galois::GNNLayer::MaskGradientNonMasters( } #endif } + +void galois::GNNLayer::MaskGradientNonMasters( + PointerWithSize* gradient, size_t max_rows, + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradient->size()); + assert(end_node * row_index <= gradient->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, + row_index); + } else { +#endif + // galois::gInfo(start_node, " to ", end_node); + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // if something is not a master, kill it + if (!bs.test(non_master)) { + // galois::gInfo("don't keep ", non_master); + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradient)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 70d85b853a..169dbe7ea3 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -206,7 +206,7 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, ", SAGE output temp var ", num_output_temp_elements, " (", FloatElementsToGB(num_output_temp_elements), " GB)"); - size_t buffer_size = (num_output_temp_elements * 0.02); + size_t buffer_size = (num_output_temp_elements * 0.02); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); @@ -370,11 +370,21 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( if (!sage_config_.disable_concat) { // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + if (graph_.IsSubgraphOn()) { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + } } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); + if (graph_.IsSubgraphOn()) { + MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); + } } #ifdef GALOIS_ENABLE_GPU @@ -411,7 +421,12 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // mask it, 
then use it // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 || sage_config_.disable_concat) { - MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); + if (graph_.IsSubgraphOn()) { + MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); + } } // if concat is disabled, then input grad isn't masked; therefore, mask // this to get the same effect @@ -460,11 +475,21 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // disable concat part is here because otherwise it would get done elsewhere // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 && sage_config_.disable_concat) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + if (graph_.IsSubgraphOn()) { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + } } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); + if (graph_.IsSubgraphOn()) { + MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); + } } // W' = F^T (FW)' From b8d89a138499698fe308b28b8ac8a729fd357595 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 25 Jun 2021 16:55:17 -0500 Subject: [PATCH 567/660] prints for gnnsubgraph --- libgnn/src/graphs/GNNSubgraph.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 360586b7df..6faa5ad419 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -183,8 +183,27 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.CSCAllocate(); TimerStop(&alloc_time); + galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", num_subgraph_edges_); + + galois::DGAccumulator empty_masters; + galois::DGAccumulator empty_mirrors; + empty_masters.reset(); + empty_mirrors.reset(); + galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { + if (local_subgraph_out_degrees_[subgraph_id] == 0 && + local_subgraph_in_degrees_[subgraph_id] == 0) { + if (subgraph_id < subgraph_master_boundary_) { + empty_masters += 1; + } else { + if (gnn_graph.GetNonLayerZeroMasters().test(subgraph_id)) { + empty_masters += 1; + } else { + empty_mirrors += 1; + } + } + } underlying_graph_.fixEndEdge( subgraph_id, local_subgraph_out_degrees_[subgraph_id]); underlying_graph_.FixEndInEdge( @@ -196,6 +215,10 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); } + uint32_t emaster = empty_masters.reduce(); + uint32_t emirror = empty_mirrors.reduce(); + galois::gInfo("empty masters percent is ", emaster / (float)num_subgraph_nodes_, " ", emaster); + galois::gInfo("empty mirrors percent is ", emirror / (float)num_subgraph_nodes_, " ", emirror); // save edges + save reference to layer sample status galois::do_all( From 2ac6505fb78e5405430b5ce4232661113a1cc197 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 
25 Jun 2021 17:17:38 -0500 Subject: [PATCH 568/660] Fixed empty master/mirror counting in subgraph --- libgnn/src/graphs/GNNSubgraph.cpp | 61 ++++++++++++++++++------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 6faa5ad419..c85da2a957 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -166,11 +166,42 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); TimerStart(&timer); + galois::DGAccumulator empty_masters; + galois::DGAccumulator empty_mirrors; + empty_masters.reset(); + empty_mirrors.reset(); + galois::DGAccumulator total_sn; + total_sn.reset(); + total_sn += num_subgraph_nodes_; + size_t global_sub_size = total_sn.reduce(); // prefix sum over subgraph degrees from previous phase to get starting points - for (size_t i = 1; i < num_subgraph_nodes_; i++) { - local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; - local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; + for (size_t i = 0; i < num_subgraph_nodes_; i++) { + if (local_subgraph_out_degrees_[i] == 0 && + local_subgraph_in_degrees_[i] == 0) { + if (i < subgraph_master_boundary_) { + empty_masters += 1; + } else { + if (gnn_graph.GetNonLayerZeroMasters().test(i)) { + empty_masters += 1; + } else { + empty_mirrors += 1; + } + } + } + if (i != 0) { + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; + } + } + + uint32_t emaster = empty_masters.reduce(); + uint32_t emirror = empty_mirrors.reduce(); + if (gnn_graph.host_id() == 0) { + galois::gInfo("empty masters percent is ", emaster / (float)global_sub_size, + " ", emaster, " ", global_sub_size); + galois::gInfo("empty mirrors percent is ", emirror / (float)global_sub_size, + " ", emirror, " ", global_sub_size); } // allocate then set node endpoints @@ -183,27 +214,11 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.CSCAllocate(); TimerStop(&alloc_time); - galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", num_subgraph_edges_); - - galois::DGAccumulator empty_masters; - galois::DGAccumulator empty_mirrors; - empty_masters.reset(); - empty_mirrors.reset(); + galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", + num_subgraph_edges_); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { - if (local_subgraph_out_degrees_[subgraph_id] == 0 && - local_subgraph_in_degrees_[subgraph_id] == 0) { - if (subgraph_id < subgraph_master_boundary_) { - empty_masters += 1; - } else { - if (gnn_graph.GetNonLayerZeroMasters().test(subgraph_id)) { - empty_masters += 1; - } else { - empty_mirrors += 1; - } - } - } underlying_graph_.fixEndEdge( subgraph_id, local_subgraph_out_degrees_[subgraph_id]); underlying_graph_.FixEndInEdge( @@ -215,10 +230,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); } - uint32_t emaster = empty_masters.reduce(); - uint32_t emirror = empty_mirrors.reduce(); - galois::gInfo("empty masters percent is ", emaster / (float)num_subgraph_nodes_, " ", emaster); - galois::gInfo("empty mirrors percent is ", emirror / (float)num_subgraph_nodes_, " ", emirror); // save edges + save 
reference to layer sample status galois::do_all( From ea88e2757de179afc481bdb7a263aebe8a421777 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 25 Jun 2021 20:23:59 -0500 Subject: [PATCH 569/660] Remove dead mirrors from GNN sampling Some mirrors have no incoming/outgoing edges and are just activated by activation on another host; these should not exist in the sampled subgraph because they occupy rows/memory which results in wasted work. --- libgnn/include/galois/GraphNeuralNetwork.h | 19 +++ .../galois/graphs/DegreeSyncStructures.h | 4 +- libgnn/include/galois/graphs/GNNGraph.h | 14 +- libgnn/src/GraphNeuralNetwork.cpp | 11 +- libgnn/src/graphs/GNNGraph.cpp | 98 +++++++------- libgnn/src/graphs/GNNSubgraph.cpp | 127 +++++++++--------- 6 files changed, 153 insertions(+), 120 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 3b5b268daa..91bdf67d14 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -199,6 +199,25 @@ class GraphNeuralNetwork { //! most literature void GradientPropagation(); + //! # nodes may change in distributed setting due to dead mirrors; + //! given the # of nodes at each layer, fix the input/output rows + void CorrectRowCounts(const std::vector& nodes_at_each_layer) { + size_t layer_offset = 0; + // work backwards + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + (*back_iter) + ->ResizeInputOutputRows(nodes_at_each_layer[layer_offset + 1], + nodes_at_each_layer[layer_offset]); + layer_offset++; + } + } + GALOIS_LOG_ASSERT(layer_offset + 1 == nodes_at_each_layer.size()); + } + //! Call whenever resize occurs to correct reuse of pointers for layers void CorrectBackwardLinks(); diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 91a94d64ac..659541570d 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -61,9 +61,7 @@ struct InitialDegreeSync { struct SubgraphDegreeSync { using ValTy = galois::gstl::Vector; - static size_t FeatVecSize() { - return gnn_sampled_out_degrees_->size(); - } + static size_t FeatVecSize() { return gnn_sampled_out_degrees_->size(); } static ValTy extract(uint32_t lid, char&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 9c2e6061bf..09fe0bffe4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -309,11 +309,12 @@ class GNNGraph { size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, bool inductive_subgraph, size_t timestamp); - size_t ConstructSampledSubgraph(size_t num_sampled_layers) { + std::vector ConstructSampledSubgraph(size_t num_sampled_layers) { return ConstructSampledSubgraph(num_sampled_layers, false); }; //! 
Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view); + std::vector ConstructSampledSubgraph(size_t num_sampled_layers, + bool use_view); unsigned SampleNodeTimestamp(unsigned lid) const { return sample_node_timestamps_[lid]; @@ -590,6 +591,10 @@ class GNNGraph { } } + bool IsActiveInSubgraph(size_t node_id) const { + return definitely_sampled_nodes_.test(node_id); + } + //! Calculate norm factor considering the entire graph void CalculateFullNormFactor(); @@ -738,6 +743,11 @@ class GNNGraph { std::vector sample_master_offsets_; //! Count of how many mirrors are in each layer in a sampled subgraph. std::vector sample_mirror_offsets_; + //! Definitely sampled nodes + galois::DynamicBitSet definitely_sampled_nodes_; + + std::vector> master_offset_accum_; + std::vector> mirror_offset_accum_; //! In a subgraph, all layer 0 masters are made the prefix of SIDs; other //! masters that are not layer 0 will be scattered elsewhere. This bitset //! tracks which of those SIDs are the masters. diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index b29ec3af88..110dff0ad4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -215,7 +215,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } // resize layer matrices - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); graph_->EnableSubgraphChooseAll(); CorrectBackwardLinks(); @@ -278,7 +278,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); } @@ -342,7 +342,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } // resize layer matrices - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); mb_timer.stop(); } @@ -379,6 +379,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); + // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { @@ -411,7 +412,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); // XXX resizes above only work for SAGE layers; will break if other // layers are tested @@ -598,7 +599,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fdd2d6e1dc..b23d5f81f4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -899,7 +899,9 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } else { subgraph_choose_all_ = true; } - + definitely_sampled_nodes_.resize(partitioned_graph_->size()); + master_offset_accum_.resize(num_layers + 1); + mirror_offset_accum_.resize(num_layers + 1); 
sample_master_offsets_.resize(num_layers + 1, 0); sample_mirror_offsets_.resize(num_layers + 1, 0); } @@ -910,12 +912,14 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); + definitely_sampled_nodes_.reset(); galois::do_all(galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { if (IsValidForPhase(*x, seed_phase)) { SetSampledNode(*x); bitset_sample_flag_.set(*x); + definitely_sampled_nodes_.set(*x); } else { UnsetSampledNode(*x); } @@ -933,6 +937,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + master_offset_accum_[i].reset(); + mirror_offset_accum_[i].reset(); + } + // clear all sampled edges galois::do_all( galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), @@ -996,11 +1005,6 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, use_subgraph_ = false; use_subgraph_view_ = false; - // galois::GAccumulator sampled; - // galois::GAccumulator total; - // sampled.reset(); - // total.reset(); - galois::do_all( galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { @@ -1019,11 +1023,12 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, } MakeEdgeSampled(edge_iter, agg_layer_num); - if (!IsInSampledGraph(partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); + if (!IsInSampledGraph(dest)) { + bitset_sample_flag_.set(dest); } - // sampled += 1; + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(dest); } } }, @@ -1056,29 +1061,15 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, galois::GAccumulator local_sample_count; local_sample_count.reset(); - galois::GAccumulator master_offset; - master_offset.reset(); - galois::GAccumulator mirror_offset; - mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } sample_node_timestamps_[*x] = timestamp; } } }); - assert(sample_master_offsets_.size() > timestamp); - assert(sample_mirror_offsets_.size() > timestamp); - sample_master_offsets_[timestamp] = master_offset.reduce(); - sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); EnableSubgraphChooseAll(); return local_sample_count.reduce(); @@ -1121,15 +1112,16 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } } + uint32_t edge_dst = partitioned_graph_->getEdgeDst(edge_iter); // if here, it means edge accepted; set sampled on, mark // as part of next set MakeEdgeSampled(edge_iter, sample_layer_num); - if (!IsInSampledGraph( - partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + if (!IsInSampledGraph(edge_dst)) { + bitset_sample_flag_.set(edge_dst); } bitset_sampled_degrees_.set(*src_iter); + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(edge_dst); // degree increment 
sampled_out_degrees_[sample_layer_num][*src_iter]++; } @@ -1165,37 +1157,22 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // count sampled node size galois::GAccumulator local_sample_count; local_sample_count.reset(); - galois::GAccumulator master_offset; - master_offset.reset(); - galois::GAccumulator mirror_offset; - mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { - local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } sample_node_timestamps_[*x] = timestamp; } } }); - assert(sample_master_offsets_.size() > timestamp); - assert(sample_mirror_offsets_.size() > timestamp); - sample_master_offsets_[timestamp] = master_offset.reduce(); - sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); DisableSubgraphChooseAll(); return local_sample_count.reduce(); } //! Construct the subgraph from sampled edges and corresponding nodes -size_t +std::vector galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the @@ -1215,13 +1192,36 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, ->sync( "Ignore"); } - size_t num_subgraph_nodes; + + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsActiveInSubgraph(*x)) { + if (sample_node_timestamps_[*x] != std::numeric_limits::max()) { + if (*x < *end_owned()) { + // master + master_offset_accum_[sample_node_timestamps_[*x]] += 1; + } else { + // mirror + mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + } + } else { + GALOIS_LOG_FATAL( + "should have been timestamped at some point if active"); + } + } + }); + + std::vector new_rows(master_offset_accum_.size()); + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + sample_master_offsets_[i] = master_offset_accum_[i].reduce(); + sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); + new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; + } + if (!use_view) { - num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); + subgraph_->BuildSubgraph(*this, num_sampled_layers); } else { // a view only has lid<->sid mappings - num_subgraph_nodes = - subgraph_->BuildSubgraphView(*this, num_sampled_layers); + subgraph_->BuildSubgraphView(*this, num_sampled_layers); } // after this, this graph is a subgraph @@ -1231,7 +1231,7 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, use_subgraph_view_ = true; } - return num_subgraph_nodes; + return new_rows; } size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index c85da2a957..fb1d7c78c6 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -39,11 +39,15 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); + galois::GAccumulator subgraph_count; subgraph_count.reset(); galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), [&](uint32_t node_id) { - if 
(gnn_graph.IsInSampledGraph(node_id)) { + // if (gnn_graph.IsInSampledGraph(node_id)) { + if (gnn_graph.IsActiveInSubgraph(node_id)) { subgraph_count += 1; } }); @@ -54,9 +58,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( galois::DynamicBitSet& non_layer_zero_masters = gnn_graph.GetNonLayerZeroMasters(); - std::vector& master_offsets = gnn_graph.GetMasterOffsets(); - std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - // init the bitset as necessary if (non_layer_zero_masters.size() < num_subgraph_nodes_) { non_layer_zero_masters.resize(num_subgraph_nodes_); @@ -78,21 +79,22 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( // split into 2 parts: masters, then everything else size_t last_owned_node = *(gnn_graph.end_owned()); - galois::gInfo(last_owned_node); for (size_t local_node_id = 0; local_node_id < last_owned_node; local_node_id++) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; - // master that won't be in prefix needs to be marked - non_layer_zero_masters.set(sid_to_use); - } else { - sid_to_use = layer_zero_offset++; + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + // master that won't be in prefix needs to be marked + non_layer_zero_masters.set(sid_to_use); + } else { + sid_to_use = layer_zero_offset++; + } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } @@ -105,16 +107,18 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( // everything else; none of these are master nodes for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); local_node_id++) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; - } else { - sid_to_use = layer_zero_offset++; + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + } else { + sid_to_use = layer_zero_offset++; + } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } @@ -166,43 +170,44 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); TimerStart(&timer); - galois::DGAccumulator empty_masters; - galois::DGAccumulator empty_mirrors; - empty_masters.reset(); - empty_mirrors.reset(); - - galois::DGAccumulator total_sn; - total_sn.reset(); - total_sn += num_subgraph_nodes_; - size_t global_sub_size = total_sn.reduce(); + // galois::DGAccumulator empty_masters; + // galois::DGAccumulator empty_mirrors; + // 
empty_masters.reset(); + // empty_mirrors.reset(); + + // galois::DGAccumulator total_sn; + // total_sn.reset(); + // total_sn += num_subgraph_nodes_; + // size_t global_sub_size = total_sn.reduce(); + // prefix sum over subgraph degrees from previous phase to get starting points - for (size_t i = 0; i < num_subgraph_nodes_; i++) { - if (local_subgraph_out_degrees_[i] == 0 && - local_subgraph_in_degrees_[i] == 0) { - if (i < subgraph_master_boundary_) { - empty_masters += 1; - } else { - if (gnn_graph.GetNonLayerZeroMasters().test(i)) { - empty_masters += 1; - } else { - empty_mirrors += 1; - } - } - } - if (i != 0) { - local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; - local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; - } + for (size_t i = 1; i < num_subgraph_nodes_; i++) { + // if (local_subgraph_out_degrees_[i] == 0 && + // local_subgraph_in_degrees_[i] == 0) { + // if (i < subgraph_master_boundary_) { + // empty_masters += 1; + // } else { + // if (gnn_graph.GetNonLayerZeroMasters().test(i)) { + // empty_masters += 1; + // } else { + // empty_mirrors += 1; + // } + // } + //} + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; } - uint32_t emaster = empty_masters.reduce(); - uint32_t emirror = empty_mirrors.reduce(); - if (gnn_graph.host_id() == 0) { - galois::gInfo("empty masters percent is ", emaster / (float)global_sub_size, - " ", emaster, " ", global_sub_size); - galois::gInfo("empty mirrors percent is ", emirror / (float)global_sub_size, - " ", emirror, " ", global_sub_size); - } + // uint32_t emaster = empty_masters.reduce(); + // uint32_t emirror = empty_mirrors.reduce(); + // if (gnn_graph.host_id() == 0) { + // galois::gInfo("Empty masters percent is ", emaster / + // (float)global_sub_size, + // " ", emaster, " ", global_sub_size); + // galois::gInfo("Empty mirrors percent is ", emirror / + // (float)global_sub_size, + // " ", emirror, " ", global_sub_size); + //} // allocate then set node endpoints num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; @@ -214,8 +219,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.CSCAllocate(); TimerStop(&alloc_time); - galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", - num_subgraph_edges_); + galois::gInfo(gnn_graph.host_prefix(), "Subgraph nodes and edges are ", + num_subgraph_nodes_, " ", num_subgraph_edges_); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { From 8f6bfdeb9ad2cef882e2ce08c887b1452e028097 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 26 Jun 2021 15:20:38 -0500 Subject: [PATCH 570/660] GNN: FATAL in dead test code path --- libgnn/src/GraphNeuralNetwork.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 110dff0ad4..957e7a8eea 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -440,6 +440,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); float test_acc; if (!config_.test_minibatch_size()) { + // TODO something about this path breaks accuracy + GALOIS_LOG_FATAL("this path breaks accuracy for the rest of the " + "run for some reason"); bool f = graph_->SubgraphChooseAllStatus(); graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); From 2c2008c3d335ff884d3a3a73f318a878d97e3f09 Mon 
Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 26 Jun 2021 19:18:00 -0500 Subject: [PATCH 571/660] Fixed row correction code to remove dead mirrors Needed to do a prefix sum over actives in each layer when correcting rows after removal of dead mirror nodes. --- libgnn/include/galois/GraphNeuralNetwork.h | 2 ++ libgnn/src/graphs/GNNGraph.cpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 91bdf67d14..4e33bcb8fa 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -209,6 +209,8 @@ class GraphNeuralNetwork { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { + GALOIS_LOG_ASSERT(nodes_at_each_layer[layer_offset + 1] >= + nodes_at_each_layer[layer_offset]); (*back_iter) ->ResizeInputOutputRows(nodes_at_each_layer[layer_offset + 1], nodes_at_each_layer[layer_offset]); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b23d5f81f4..ef92ef7615 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1215,6 +1215,9 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, sample_master_offsets_[i] = master_offset_accum_[i].reduce(); sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; + if (i > 0) { + new_rows[i] += new_rows[i - 1]; + } } if (!use_view) { From 8c15669112e73aed7d51a81839e1829d5d0e6106 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Jun 2021 18:13:18 -0500 Subject: [PATCH 572/660] Sample sync; non subgraph node needs to return vec Before this commit, sample sync for a node not in the subgraph would return an empty vector. This is bad because Gluon will expect vectors of a certain length, and trying to copy beyond allocated memory may cause memory corruption. This fixes that by allocating dead space to serialize anyways. --- libgnn/include/galois/graphs/GraphAggregationSyncStructures.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 7501a7c23d..29fdd66e0c 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -178,6 +178,9 @@ struct GNNSampleSumAggregate { extracted_vec.reserve(gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { + // need to have correct size because serializer will expect + // it to be of a certain length + extracted_vec.resize(gnn_matrix_to_sync_column_length_, 0); return extracted_vec; } From 3d820f6cc46243cf63c16d10d429a826956c4506 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Jun 2021 21:21:31 -0500 Subject: [PATCH 573/660] Resizes layers after mirror nodes are deleted Only resizes layers (i.e. allocates memory) after it is known how many mirror nodes remain after deletion of dead mirrors rather than resizing the worst case scenario and changing rows later. This is done to save memory. 
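As a rough sketch of the intended sizing order (toy code under stated assumptions: LayerDims and SizeSampledLayers are made-up names, and the filtering to GCN/SAGE layers is omitted; the real path in this patch goes through CorrectRowCounts and ResizeInputOutputRows):

    #include <cstddef>
    #include <vector>

    // rows[] holds the live (master + mirror) row counts per sampled layer
    // reported by subgraph construction once dead mirrors are dropped; it
    // needs layers.size() + 1 entries because layer i reads rows[i + 1]
    // input rows and writes rows[i] output rows.
    struct LayerDims {
      std::size_t input_rows;
      std::size_t output_rows;
    };

    // Size every layer exactly once, back to front, instead of allocating a
    // worst-case matrix up front and shrinking it afterwards.
    void SizeSampledLayers(std::vector<LayerDims>& layers,
                           const std::vector<std::size_t>& rows) {
      std::size_t offset = 0;
      for (auto it = layers.rbegin(); it != layers.rend(); ++it, ++offset) {
        it->input_rows  = rows[offset + 1];
        it->output_rows = rows[offset];
      }
    }
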
--- libgnn/include/galois/GraphNeuralNetwork.h | 5 +++ libgnn/src/GraphNeuralNetwork.cpp | 51 ++++++++++++---------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 4e33bcb8fa..7d71efa61c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -202,6 +202,11 @@ class GraphNeuralNetwork { //! # nodes may change in distributed setting due to dead mirrors; //! given the # of nodes at each layer, fix the input/output rows void CorrectRowCounts(const std::vector& nodes_at_each_layer) { + // assumes last layer is output row and resizes it based on first + // offset + gnn_layers_.back()->ResizeInputOutputRows(nodes_at_each_layer[0], + nodes_at_each_layer[0]); + size_t layer_offset = 0; // work backwards for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 957e7a8eea..1ed89e99cc 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -188,9 +188,11 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { uint32_t total = 0; while (true) { work_left_.reset(); - size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + // size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + graph_->PrepareNextTestMinibatch(); // last layer input size/output rows becomes seed node size - gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); @@ -200,14 +202,14 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { layer_type == GNNLayerType::kSAGE) { // you can minibatch with sampling or minibatch and grab all // relevant neighbors - size_t current_sample_size; - current_sample_size = - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), - false, num_sampled_layers + 1); + // size_t current_sample_size; + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), false, + num_sampled_layers + 1); // resize this layer, change seed node count - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, seed_node_count); - seed_node_count = current_sample_size; + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, seed_node_count); + // seed_node_count = current_sample_size; + num_sampled_layers++; // XXX resizes above only work for SAGE layers; will break if other // layers are tested @@ -257,7 +259,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; - gnn_layers_.back()->ResizeRows(local_seed_node_count); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -271,8 +273,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); // resizing - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, local_seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); local_seed_node_count = current_sample_size; 
subgraph_layer_sizes.emplace_back(local_seed_node_count); num_sampled_layers++; @@ -313,7 +316,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { mb_timer.start(); size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - gnn_layers_.back()->ResizeRows(local_seed_node_count); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; @@ -334,9 +337,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, - local_seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); local_seed_node_count = current_sample_size; num_sampled_layers++; } @@ -376,8 +379,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { seed_node_count); // last layer input size/output rows becomes seed node size - gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, - seed_node_count); + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; @@ -404,8 +407,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); // resize this layer, change seed node count - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, seed_node_count); seed_node_count = current_sample_size; num_sampled_layers++; } @@ -585,7 +588,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; - gnn_layers_.back()->ResizeRows(local_seed_node_count); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -595,9 +598,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), config_.inductive_subgraph_, num_sampled_layers + 1); // resizing - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, - local_seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); local_seed_node_count = current_sample_size; num_sampled_layers++; } From 46196de98c87f3e6143554902b7e4811eecb4a73 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Jun 2021 17:57:27 -0500 Subject: [PATCH 574/660] Softmax backward timer fix Timer for backward phase of softmax was coupled into the forward timer; this commit changes it to backward --- libgnn/src/layers/SoftmaxLayer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index eb6e900413..8b99db4073 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -88,7 +88,7 @@ galois::SoftmaxLayer::ForwardPhase( galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { - galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); + galois::StatTimer timer("SoftmaxBackward", "SoftmaxLayer"); TimerStart(&timer); const size_t feature_length = layer_dimensions_.input_columns; From 
c99236cab58001997db76f806fa215c84df4404a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Jun 2021 18:00:58 -0500 Subject: [PATCH 575/660] Adjustable learning rate for Adam optimizer Command line argument added to adjust Adam learning rate for GNNs. --- lonestar/libgnnbench/include/GNNBench/Start.h | 1 + lonestar/libgnnbench/src/Input.cpp | 11 ++++++++++- lonestar/libgnnbench/src/Start.cpp | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index 48507df80e..125307e0c3 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -18,6 +18,7 @@ extern llvm::cl::opt cl_layer_type; extern llvm::cl::opt train_minibatch_size; extern llvm::cl::opt test_minibatch_size; extern llvm::cl::opt do_graph_sampling; +extern llvm::cl::opt learning_rate; #ifdef GALOIS_ENABLE_GPU std::string personality_str(DevicePersonality p); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 3ebee8adea..7719340224 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -160,6 +160,11 @@ llvm::cl::opt cll::desc("# of epochs to test test set (default 0)"), cll::init(0)); +llvm::cl::opt + learning_rate("learningRate", + cll::desc("Adam optimizer learning rate (default 0.01)"), + cll::init(0.01)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -304,8 +309,12 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { } GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); + galois::AdamOptimizer::AdamConfiguration adam_config; + adam_config.alpha = learning_rate; + // TODO only adam works right now, add the others later - return std::make_unique(opt_sizes, num_layers); + return std::make_unique(adam_config, opt_sizes, + num_layers); } std::vector CreateFanOutVector() { diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index 9a7e747744..daff6ad114 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -116,6 +116,7 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, test_minibatch_size); galois::runtime::reportParam("GNNBench", "IsGraphSampled", do_graph_sampling); + galois::runtime::reportParam("GNNBench", "LearningRate", learning_rate); } char name[256]; From 46c07d91855fe2c19582e37426e54e104ee2bd7a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Jun 2021 18:33:50 -0500 Subject: [PATCH 576/660] Timers for full forward/backward phase in GNN Forward/backward timers before this commit did not catch non GCN/SAGE layers. This commit puts a timer around the forward/backward phase. Note that the accuracy check will not be caught by this timer but will still be in the epoch timer. 
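In sketch form, the change is a conditional StatTimer around the whole phase (fragment only; members such as timers_on_, gnn_layers_, and layer_input come from the diff below and are not redefined here):

    // Spans every layer's forward call, so non-GCN/SAGE layers such as the
    // softmax output layer are counted too; the timer only runs when
    // timers_on_ is set, so phases with timers disabled (see EnableTimers /
    // DisableTimers) stay out of the measurement.
    galois::StatTimer timer("DoInference", "GraphNeuralNetwork");
    if (timers_on_) {
      timer.start();
    }
    for (auto& layer_ptr : gnn_layers_) {
      layer_input = layer_ptr->ForwardPhase(layer_input);
    }
    if (timers_on_) {
      timer.stop();
    }
    // The accuracy check happens after timer.stop(), so it is covered only
    // by the per-epoch timer, as noted above.

The backward phase is wrapped the same way with a "GradientPropagation" timer.
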
--- libgnn/include/galois/GraphNeuralNetwork.h | 4 ++++ libgnn/src/GraphNeuralNetwork.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 7d71efa61c..ff13e24c41 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -231,7 +231,10 @@ class GraphNeuralNetwork { private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; + bool timers_on_{false}; + void EnableTimers() { + timers_on_ = true; galois::gDebug("Enabling timers"); graph_->EnableTimers(); for (auto& layer : gnn_layers_) @@ -239,6 +242,7 @@ class GraphNeuralNetwork { } void DisableTimers() { + timers_on_ = false; galois::gDebug("Disabling timers"); graph_->DisableTimers(); for (auto& layer : gnn_layers_) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 1ed89e99cc..7629a8ef57 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -655,6 +655,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const galois::PointerWithSize galois::GraphNeuralNetwork::DoInference() { + galois::StatTimer timer("DoInference", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + // start with graph features and pass it through all layers of the network galois::PointerWithSize layer_input = graph_->GetLocalFeatures(); @@ -663,6 +668,10 @@ galois::GraphNeuralNetwork::DoInference() { layer_input = ptr->ForwardPhase(layer_input); } + if (timers_on_) { + timer.stop(); + } + return layer_input; } @@ -688,6 +697,11 @@ float galois::GraphNeuralNetwork::GetGlobalAccuracy( } void galois::GraphNeuralNetwork::GradientPropagation() { + galois::StatTimer timer("GradientPropagation", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + // from output layer get initial gradients std::vector dummy; std::unique_ptr& output_layer = gnn_layers_.back(); @@ -715,6 +729,10 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // update the weights of the layer gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); } + + if (timers_on_) { + timer.stop(); + } } void galois::GraphNeuralNetwork::CorrectBackwardLinks() { From 71248703017037c2fea52a3bfbd248c490cf0922 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 2 Jul 2021 20:39:54 -0500 Subject: [PATCH 577/660] sage layer assertion fix --- libgnn/src/layers/SAGELayer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 169dbe7ea3..8962ec319a 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -295,7 +295,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_columns * layer_dimensions_.output_columns); + layer_dimensions_.output_rows * layer_dimensions_.output_columns); UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); } else { assert(p_out_temp_.size() >= @@ -305,7 +305,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( UpdateEmbeddings(input_data, p_out_temp_.data(), false); // A(FW) assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_columns * layer_dimensions_.output_columns); + layer_dimensions_.output_rows * layer_dimensions_.output_columns); 
AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), &output_column_intermediates_); From daf355f3a6e01368bbbbf0ef839509411af642af Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 3 Jul 2021 23:11:47 -0500 Subject: [PATCH 578/660] Mirror/master nodes in gluon now pointers Made the master/mirror node vars in Gluon pointers so that they can be swapped in/out; mostly used for subgraph mirror changes to avoid extra communications to dead mirrors on other hosts. --- .../include/galois/graphs/DistributedGraph.h | 3 + libcusp/include/galois/graphs/NewGeneric.h | 1 + .../include/galois/graphs/GluonSubstrate.h | 266 ++++++++++++------ 3 files changed, 177 insertions(+), 93 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index bf88a17acf..0e3e5fa43c 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -543,6 +543,9 @@ class DistGraph { public: virtual ~DistGraph() {} + + unsigned GetLIDHost(uint64_t lid) const { return getHostIDImpl(getGID(lid)); } + //! Determines which host has the master for a particular node //! @returns Host id of node in question inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 6f13f42737..4ff7832f3e 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -2589,6 +2589,7 @@ class NewDistGraphGeneric : public DistGraph { for (uint32_t i = base_DistGraph::numOwned; i < base_DistGraph::numNodes; i++) { uint32_t globalID = base_DistGraph::localToGlobalVector[i]; + assert(graphPartitioner->retrieveMaster(globalID) != base_DistGraph::id); base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)] .push_back(globalID); } diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index ae50e0e10f..860480b262 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -119,12 +119,16 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // memoization optimization //! Master nodes on different hosts. For broadcast; - std::vector> masterNodes; + std::vector> master_nodes_concrete_; + std::vector> subgraph_master_nodes_; + std::vector>* masterNodes; //! Mirror nodes on different hosts. For reduce; comes from the user graph //! during initialization (we expect user to give to us) - std::vector>& mirrorNodes; + std::vector>* mirrorNodes; //! Maximum size of master or mirror nodes on different hosts size_t maxSharedSize; + //! 
Maximum size of master or mirror nodes on different hosts + size_t original_max_shared_size_; #ifdef GALOIS_USE_BARE_MPI std::vector mpi_identity_groups; @@ -190,7 +194,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { continue; galois::runtime::SendBuffer b; - gSerialize(b, mirrorNodes[x]); + gSerialize(b, (*mirrorNodes)[x]); net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } @@ -204,7 +208,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); - galois::runtime::gDeserialize(p->second, masterNodes[p->first]); + galois::runtime::gDeserialize(p->second, (*masterNodes)[p->first]); } incrementEvilPhase(); } @@ -274,11 +278,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // convert the global ids stored in the master/mirror nodes arrays to local // ids // TODO: use 32-bit distinct vectors for masters and mirrors from here on - for (uint32_t h = 0; h < masterNodes.size(); ++h) { + for (uint32_t h = 0; h < masterNodes->size(); ++h) { galois::do_all( - galois::iterate(size_t{0}, masterNodes[h].size()), + galois::iterate(size_t{0}, (*masterNodes)[h].size()), [&](size_t n) { - masterNodes[h][n] = userGraph.getLID(masterNodes[h][n]); + (*masterNodes)[h][n] = userGraph.getLID((*masterNodes)[h][n]); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier("MasterNodes").c_str()), @@ -286,11 +290,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::no_stats()); } - for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { galois::do_all( - galois::iterate(size_t{0}, mirrorNodes[h].size()), + galois::iterate(size_t{0}, (*mirrorNodes)[h].size()), [&](size_t n) { - mirrorNodes[h][n] = userGraph.getLID(mirrorNodes[h][n]); + (*mirrorNodes)[h][n] = userGraph.getLID((*mirrorNodes)[h][n]); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier("MirrorNodes").c_str()), @@ -302,30 +306,32 @@ class GluonSubstrate : public galois::runtime::GlobalObject { maxSharedSize = 0; // report masters/mirrors to/from other hosts as statistics - for (auto x = 0U; x < masterNodes.size(); ++x) { + for (auto x = 0U; x < masterNodes->size(); ++x) { if (x == id) continue; std::string master_nodes_str = "MasterNodesFrom_" + std::to_string(id) + "_To_" + std::to_string(x); galois::runtime::reportStatCond_Tsum( - RNAME, master_nodes_str, masterNodes[x].size()); - if (masterNodes[x].size() > maxSharedSize) { - maxSharedSize = masterNodes[x].size(); + RNAME, master_nodes_str, (*masterNodes)[x].size()); + if ((*masterNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*masterNodes)[x].size(); } } - for (auto x = 0U; x < mirrorNodes.size(); ++x) { + for (auto x = 0U; x < mirrorNodes->size(); ++x) { if (x == id) continue; std::string mirror_nodes_str = "MirrorNodesFrom_" + std::to_string(x) + "_To_" + std::to_string(id); galois::runtime::reportStatCond_Tsum( - RNAME, mirror_nodes_str, mirrorNodes[x].size()); - if (mirrorNodes[x].size() > maxSharedSize) { - maxSharedSize = mirrorNodes[x].size(); + RNAME, mirror_nodes_str, (*mirrorNodes)[x].size()); + if ((*mirrorNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*mirrorNodes)[x].size(); } } + original_max_shared_size_ = maxSharedSize; + sendInfoToHost(); // do not track memory usage of partitioning @@ -435,7 +441,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { cartesianGrid(_cartesianGrid), partitionAgnostic(_partitionAgnostic), 
substrateDataMode(_enforcedDataMode), numHosts(numHosts), num_run(0), num_round(0), currentBVFlag(nullptr), - mirrorNodes(userGraph.getMirrorNodes()) { + masterNodes(&master_nodes_concrete_), + mirrorNodes(&(userGraph.getMirrorNodes())) { is_a_graph_ = _userGraph.is_a_graph(); if (cartesianGrid.first != 0 && cartesianGrid.second != 0) { GALOIS_ASSERT(cartesianGrid.first * cartesianGrid.second == numHosts, @@ -455,7 +462,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { initBareMPI(); // master setup from mirrors done by setupCommunication call - masterNodes.resize(numHosts); + masterNodes->resize(numHosts); // setup proxy communication galois::CondStatTimer Tgraph_construct_comm( "GraphCommSetupTime", RNAME); @@ -464,11 +471,73 @@ class GluonSubstrate : public galois::runtime::GlobalObject { Tgraph_construct_comm.stop(); } + void RevertHandshakeToRealGraph() { + // XXX make sure I dont need anything else + masterNodes = &master_nodes_concrete_; + mirrorNodes = &(userGraph.getMirrorNodes()); + maxSharedSize = original_max_shared_size_; + } + + void + SetupSubgraphMirrors(std::vector>& subgraph_mirrors) { + galois::StatTimer t("SubgraphMirrorSetup"); + t.start(); + + // resetup master mirrors + masterNodes = &subgraph_master_nodes_; + mirrorNodes = &subgraph_mirrors; + masterNodes->clear(); + if (masterNodes->size() < numHosts) + masterNodes->resize(numHosts); + + // Exchange information for memoization optimization. + exchangeProxyInfo(); + + assert(masterNodes->size() == numHosts); + assert(mirrorNodes->size() == numHosts); + + // convert the global ids stored in the master/mirror nodes arrays to local + // ids + // TODO: use 32-bit distinct vectors for masters and mirrors from here on + for (uint32_t h = 0; h < masterNodes->size(); ++h) { + galois::do_all( + galois::iterate(size_t{0}, (*masterNodes)[h].size()), + [&](size_t n) { + (*masterNodes)[h][n] = userGraph.getLID((*masterNodes)[h][n]); + }, + galois::no_stats()); + } + + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { + galois::do_all( + galois::iterate(size_t{0}, (*mirrorNodes)[h].size()), + [&](size_t n) { + (*mirrorNodes)[h][n] = userGraph.getLID((*mirrorNodes)[h][n]); + }, + galois::no_stats()); + } + + maxSharedSize = 0; + for (auto x = 0U; x < masterNodes->size(); ++x) { + assert(x < mirrorNodes->size()); + if (x == id) + continue; + if ((*masterNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*masterNodes)[x].size(); + } + if ((*mirrorNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*mirrorNodes)[x].size(); + } + } + + t.stop(); + } + +private: //////////////////////////////////////////////////////////////////////////////// // Data extraction from bitsets //////////////////////////////////////////////////////////////////////////////// -private: /** * Given a bitset, determine the indices of the bitset that are currently * set. @@ -820,7 +889,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + auto& sharedNodes = + (syncType == syncReduce) ? 
(*mirrorNodes) : (*masterNodes); SyncExtract2D( loopName, x, sharedNodes[x], b, elem_size); @@ -850,7 +920,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + auto& sharedNodes = + (syncType == syncReduce) ? (*mirrorNodes) : (*masterNodes); if (BitsetFnTy::is_valid()) { syncExtract( @@ -886,7 +957,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template void serializeMessage(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, + size_t bit_set_count, + const std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, VecType& val_vec, galois::runtime::SendBuffer& b) { @@ -931,7 +1003,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template void serializeMessageVecHack(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, + size_t bit_set_count, + const std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, VecType& val_vec, galois::runtime::SendBuffer& b) { @@ -971,7 +1044,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Calls data on the TwoDVector template void SerializeMessage2D(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, + size_t bit_set_count, + const std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, @@ -1014,12 +1088,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Only serializes the prefix template - void - SerializeMessagePrefix2D(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, - galois::PODResizeableArray& offsets, - galois::DynamicBitSet& bit_set_comm, - galois::runtime::SendBuffer& b) { + void SerializeMessagePrefix2D( + std::string loopName, DataCommMode data_mode, size_t bit_set_count, + const std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, galois::runtime::SendBuffer& b) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string serialize_timer_str(syncTypeStr + "SerializeMessagePrefix_" + get_run_identifier(loopName)); @@ -1258,7 +1331,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ bool nothingToSend(unsigned host, SyncType syncType, WriteLocation writeLocation, ReadLocation readLocation) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + auto& sharedNodes = + (syncType == syncReduce) ? (*mirrorNodes) : (*masterNodes); // TODO refactor (below) if (!isCartCut) { return (sharedNodes[host].size() == 0); @@ -1287,7 +1361,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ bool nothingToRecv(unsigned host, SyncType syncType, WriteLocation writeLocation, ReadLocation readLocation) { - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + auto& sharedNodes = + (syncType == syncReduce) ? 
(*masterNodes) : (*mirrorNodes); // TODO refactor (above) if (!isCartCut) { return (sharedNodes[host].size() == 0); @@ -2041,8 +2116,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, galois::runtime::SendBuffer& b, - size_t elem_size) { + const std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; @@ -2122,8 +2197,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, galois::runtime::SendBuffer& b, - size_t elem_size) { + const std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string extract_timer_str(syncTypeStr + "Extract_" + get_run_identifier(loopName)); @@ -2204,8 +2279,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, galois::runtime::SendBuffer& b, - size_t elem_size) { + const std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; static VecTy val_vec; // sometimes wasteful @@ -2337,7 +2412,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::type* = nullptr, typename std::enable_if::value>::type* = nullptr> void syncExtractFloatVecHack(std::string loopName, unsigned from_id, - std::vector& indices, + const std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { // TODO(loc) assumption that type in the VecTy is a vector of floats @@ -2479,7 +2554,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template void SyncExtract2D(std::string loopName, unsigned from_id, - std::vector& indices, + const std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; @@ -2641,7 +2716,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, bool async, typename std::enable_if::type* = nullptr> - void syncExtract(std::string loopName, unsigned, std::vector& indices, + void syncExtract(std::string loopName, unsigned, + const std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; @@ -2684,13 +2760,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // vector extract, i.e. 
get element i of the vector (i passed in as // argument as well) if (data_mode == onlyData) { - // galois::gInfo(id, " node ", i, " has data to send"); bit_set_count = indices.size(); extractSubset( loopName, indices, bit_set_count, offsets, val_vec, i); } else if (data_mode != noData) { // bitsetData or offsetsData or gidsData - // galois::gInfo(id, " node ", i, " has data to send"); extractSubset( loopName, indices, bit_set_count, offsets, val_vec, i); } @@ -2926,9 +3000,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { static VecTy val_vec; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3014,12 +3089,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { set_batch_timer_str.c_str(), RNAME); //////////////////////////////////////////////////////////////////////////// - galois::DynamicBitSet& bit_set_comm = syncBitset; + galois::DynamicBitSet& bit_set_comm = syncBitset; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3115,9 +3191,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { static galois::gstl::Vector single_array; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3251,9 +3328,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { static VecTy val_vec; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3536,7 +3614,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (rb.size() == 0) { // create the receive buffers TRecvTime.start(); - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); rb.resize(numHosts); request.resize(numHosts, MPI_REQUEST_NULL); @@ -3591,7 +3670,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (window.size() == 0) { // create the windows TRecvTime.start(); - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + auto& sharedNodes = + (syncType == syncReduce) ? 
(*masterNodes) : (*mirrorNodes); window.resize(numHosts); rb.resize(numHosts); @@ -4407,20 +4487,20 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // copy memoization meta-data m.num_master_nodes = - (unsigned int*)calloc(masterNodes.size(), sizeof(unsigned int)); + (unsigned int*)calloc(masterNodes->size(), sizeof(unsigned int)); ; m.master_nodes = - (unsigned int**)calloc(masterNodes.size(), sizeof(unsigned int*)); + (unsigned int**)calloc(masterNodes->size(), sizeof(unsigned int*)); ; - for (uint32_t h = 0; h < masterNodes.size(); ++h) { - m.num_master_nodes[h] = masterNodes[h].size(); + for (uint32_t h = 0; h < masterNodes->size(); ++h) { + m.num_master_nodes[h] = (*masterNodes)[h].size(); - if (masterNodes[h].size() > 0) { - m.master_nodes[h] = - (unsigned int*)calloc(masterNodes[h].size(), sizeof(unsigned int)); + if ((*masterNodes)[h].size() > 0) { + m.master_nodes[h] = (unsigned int*)calloc((*masterNodes)[h].size(), + sizeof(unsigned int)); ; - std::copy(masterNodes[h].begin(), masterNodes[h].end(), + std::copy((*masterNodes)[h].begin(), (*masterNodes)[h].end(), m.master_nodes[h]); } else { m.master_nodes[h] = NULL; @@ -4428,19 +4508,19 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } m.num_mirror_nodes = - (unsigned int*)calloc(mirrorNodes.size(), sizeof(unsigned int)); + (unsigned int*)calloc(mirrorNodes->size(), sizeof(unsigned int)); ; m.mirror_nodes = - (unsigned int**)calloc(mirrorNodes.size(), sizeof(unsigned int*)); + (unsigned int**)calloc(mirrorNodes->size(), sizeof(unsigned int*)); ; - for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { - m.num_mirror_nodes[h] = mirrorNodes[h].size(); + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { + m.num_mirror_nodes[h] = (*mirrorNodes)[h].size(); - if (mirrorNodes[h].size() > 0) { - m.mirror_nodes[h] = - (unsigned int*)calloc(mirrorNodes[h].size(), sizeof(unsigned int)); + if ((*mirrorNodes)[h].size() > 0) { + m.mirror_nodes[h] = (unsigned int*)calloc((*mirrorNodes)[h].size(), + sizeof(unsigned int)); ; - std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(), + std::copy((*mirrorNodes)[h].begin(), (*mirrorNodes)[h].end(), m.mirror_nodes[h]); } else { m.mirror_nodes[h] = NULL; @@ -4469,18 +4549,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // copy memoization meta-data g_info.num_master_nodes = - (unsigned int*)calloc(masterNodes.size(), sizeof(unsigned int)); + (unsigned int*)calloc(masterNodes->size(), sizeof(unsigned int)); g_info.master_nodes = - (unsigned int**)calloc(masterNodes.size(), sizeof(unsigned int*)); + (unsigned int**)calloc(masterNodes->size(), sizeof(unsigned int*)); - for (uint32_t h = 0; h < masterNodes.size(); ++h) { - g_info.num_master_nodes[h] = masterNodes[h].size(); + for (uint32_t h = 0; h < masterNodes->size(); ++h) { + g_info.num_master_nodes[h] = (*masterNodes)[h].size(); - if (masterNodes[h].size() > 0) { - g_info.master_nodes[h] = - (unsigned int*)calloc(masterNodes[h].size(), sizeof(unsigned int)); + if ((*masterNodes)[h].size() > 0) { + g_info.master_nodes[h] = (unsigned int*)calloc((*masterNodes)[h].size(), + sizeof(unsigned int)); ; - std::copy(masterNodes[h].begin(), masterNodes[h].end(), + std::copy((*masterNodes)[h].begin(), (*masterNodes)[h].end(), g_info.master_nodes[h]); } else { g_info.master_nodes[h] = NULL; @@ -4488,16 +4568,16 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } g_info.num_mirror_nodes = - (unsigned int*)calloc(mirrorNodes.size(), sizeof(unsigned int)); + (unsigned 
int*)calloc(mirrorNodes->size(), sizeof(unsigned int)); g_info.mirror_nodes = - (unsigned int**)calloc(mirrorNodes.size(), sizeof(unsigned int*)); - for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { - g_info.num_mirror_nodes[h] = mirrorNodes[h].size(); - - if (mirrorNodes[h].size() > 0) { - g_info.mirror_nodes[h] = - (unsigned int*)calloc(mirrorNodes[h].size(), sizeof(unsigned int)); - std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(), + (unsigned int**)calloc(mirrorNodes->size(), sizeof(unsigned int*)); + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { + g_info.num_mirror_nodes[h] = (*mirrorNodes)[h].size(); + + if ((*mirrorNodes)[h].size() > 0) { + g_info.mirror_nodes[h] = (unsigned int*)calloc((*mirrorNodes)[h].size(), + sizeof(unsigned int)); + std::copy((*mirrorNodes)[h].begin(), (*mirrorNodes)[h].end(), g_info.mirror_nodes[h]); } else { g_info.mirror_nodes[h] = NULL; From 225f3fca34769a117c81d367668e2f3e635e3336 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 3 Jul 2021 23:20:47 -0500 Subject: [PATCH 579/660] Subgraph mirror creation for GNN sampling Redoes Gluon handshake for live mirrors after subgraph creation; idea is to avoid broadcasting very large feature vectors to dead mirror nodes. There are a lot of dead commented out prints in this code that need to be removed; low priority for now, just trying to get this pushed to test on Stampede. --- libgnn/include/galois/graphs/GNNGraph.h | 5 +++++ libgnn/include/galois/graphs/GNNSubgraph.h | 8 ++++++++ .../graphs/GraphAggregationSyncStructures.h | 15 +++++++++++++++ libgnn/src/GraphNeuralNetwork.cpp | 8 +++++--- libgnn/src/graphs/GNNGraph.cpp | 17 +++++++++++------ libgnn/src/graphs/GNNSubgraph.cpp | 11 +++++++++++ 6 files changed, 55 insertions(+), 9 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 09fe0bffe4..1d639a9cbd 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -69,6 +69,7 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + size_t global_size() const { return partitioned_graph_->globalSize(); } //! Returns # of nodes in the *graph that is currently active*. size_t active_size() const { if (!use_subgraph_ && !use_subgraph_view_) { @@ -81,6 +82,9 @@ class GNNGraph { bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } size_t GetGID(size_t lid) const { return partitioned_graph_->getGID(lid); } + size_t GetHostID(size_t gid) const { + return partitioned_graph_->getHostID(gid); + } //! Node begin for all local nodes NodeIterator begin() const { @@ -325,6 +329,7 @@ class GNNGraph { void DisableSubgraph() { use_subgraph_ = false; use_subgraph_view_ = false; + sync_substrate_->RevertHandshakeToRealGraph(); } bool IsSubgraphOn() const { return use_subgraph_; } bool IsSubgraphViewOn() const { return use_subgraph_view_; } diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index ddd4c8d277..2836e3f181 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -12,6 +12,7 @@ class GNNSubgraph { std::numeric_limits::max()); // the subgraph to original graph maps are allocated on demand in gstl // vectors since those change every epoch + subgraph_mirrors_.resize(galois::runtime::getSystemNetworkInterface().Num); } //! 
Given sampled bits set on gnn_graph, builds an explicit subgraph //! for the sampled bits @@ -100,6 +101,10 @@ class GNNSubgraph { void EnableTimers() { use_timer_ = true; } void DisableTimers() { use_timer_ = false; } + std::vector>& GetSubgraphMirrors() { + return subgraph_mirrors_; + } + private: bool use_timer_{true}; void TimerStart(galois::StatTimer* t) { @@ -152,4 +157,7 @@ class GNNSubgraph { //! Maps from subgraph in-edge id to original graph edge id (used to check if //! edge exists in particular layer) galois::PODResizeableArray in_subedge_to_original_edge_; + + //! Mirror mappings for Gluon for subgraph + std::vector> subgraph_mirrors_; }; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 29fdd66e0c..8cfb3c9a5d 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -13,6 +13,7 @@ extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; extern galois::LargeArray* gnn_lid_to_sid_pointer_; extern galois::DynamicBitSet bitset_sample_flag_; +extern size_t subgraph_size_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -216,9 +217,12 @@ struct GNNSampleSumAggregate { std::numeric_limits::max()) { return false; } + assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // galois::gPrint("write ", (*gnn_lid_to_sid_pointer_)[node_id] * + // gnn_matrix_to_sync_column_length_ + i, "\n"); gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + i] += y[i]; @@ -231,9 +235,14 @@ struct GNNSampleSumAggregate { std::numeric_limits::max()) { return false; } + assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // galois::gPrint(galois::runtime::getSystemNetworkInterface().ID, "] + // nodeid ", node_id, " sid ", (*gnn_lid_to_sid_pointer_)[node_id], + // " write ", (*gnn_lid_to_sid_pointer_)[node_id] * + // gnn_matrix_to_sync_column_length_ + i, "\n"); gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + i] += y[i]; @@ -251,6 +260,7 @@ struct GNNSampleSumAggregate { std::numeric_limits::max()) { return; } + assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -267,6 +277,11 @@ struct GNNSampleSumAggregate { // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // galois::gPrint(galois::runtime::getSystemNetworkInterface().ID, "] + // broadxast nodeid ", node_id, " sid ", + // (*gnn_lid_to_sid_pointer_)[node_id], + // " write ", (*gnn_lid_to_sid_pointer_)[node_id] * + // gnn_matrix_to_sync_column_length_ + i, "\n"); gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + i] = y[i]; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7629a8ef57..b6c38963ed 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -179,6 +179,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::MinibatchedTesting() { galois::gDebug("Minibatched Testing"); + 
graph_->DisableSubgraph(); graph_->ResetTestMinibatcher(); SetLayerPhases(galois::GNNPhase::kBatch); @@ -630,7 +631,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (!config_.test_minibatch_size()) { for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - // TODO nuclear resize + // TODO nuclear resize; this is **ridiculously** inefficient + // because full graph will be used even if not included in test + // k-hop neighborhood for eval (*layer)->ResizeRows(graph_->size()); } CorrectBackwardLinks(); @@ -649,8 +652,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { global_accuracy); } - // return global_accuracy; - return 0; + return global_accuracy; } const galois::PointerWithSize diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index ef92ef7615..19d7bb0ad5 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -41,6 +41,7 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +size_t subgraph_size_ = 0; //! For synchronization of graph aggregations galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; @@ -204,6 +205,7 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, bool is_backward) const { gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; + subgraph_size_ = active_size(); if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { @@ -907,8 +909,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { - use_subgraph_ = false; - use_subgraph_view_ = false; + DisableSubgraph(); bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); @@ -987,6 +988,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { mirror_offset += 1; } + // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); local_seed_count += 1; // 0 = seed node sample_node_timestamps_[*x] = 0; @@ -1002,8 +1004,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, size_t timestamp) { - use_subgraph_ = false; - use_subgraph_view_ = false; + DisableSubgraph(); galois::do_all( galois::iterate(begin(), end()), @@ -1066,6 +1067,8 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + // galois::gInfo(host_prefix_, "Layer ", timestamp, " new node is ", + // GetGID(*x)); sample_node_timestamps_[*x] = timestamp; } } @@ -1177,8 +1180,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the // real graph - use_subgraph_ = false; - use_subgraph_view_ = false; + DisableSubgraph(); + gnn_sampled_out_degrees_ = &sampled_out_degrees_; // first, sync the degres of the sampled edges across all hosts @@ -1227,6 +1230,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, subgraph_->BuildSubgraphView(*this, num_sampled_layers); } + sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors()); + // after this, this graph is a subgraph if (!use_view) { 
use_subgraph_ = true; diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index fb1d7c78c6..141390e20e 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -5,6 +5,10 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); TimerStart(&timer); + for (auto& vec : subgraph_mirrors_) { + vec.clear(); + // vec.reserve(num_subgraph_nodes_ - subgraph_master_boundary_); + } CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; @@ -118,6 +122,13 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( } subgraph_id_to_lid_[sid_to_use] = local_node_id; lid_to_subgraph_id_[local_node_id] = sid_to_use++; + + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + // mirror node; gids because they need to be sent as gids + // and converted over later + assert(node_gid < gnn_graph.global_size()); + assert(subgraph_mirrors_.size() > gnn_graph.GetHostID(node_gid)); + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].push_back(node_gid); } } } From 59d6deeac5068fd781694aadf94dd7d44c197717 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Jul 2021 21:43:25 -0500 Subject: [PATCH 580/660] Argument for gnn gluon handshake to disable timers Disable timer argument to be used for test phases to avoid counting time for that. --- libgluon/include/galois/graphs/GluonSubstrate.h | 11 ++++++++--- libgnn/src/graphs/GNNGraph.cpp | 3 ++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 860480b262..dc834357b5 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -479,9 +479,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } void - SetupSubgraphMirrors(std::vector>& subgraph_mirrors) { + SetupSubgraphMirrors(std::vector>& subgraph_mirrors, + bool use_timer) { galois::StatTimer t("SubgraphMirrorSetup"); - t.start(); + if (use_timer) { + t.start(); + } // resetup master mirrors masterNodes = &subgraph_master_nodes_; @@ -530,7 +533,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } - t.stop(); + if (use_timer) { + t.stop(); + } } private: diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 19d7bb0ad5..36ef2ab58b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1230,7 +1230,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, subgraph_->BuildSubgraphView(*this, num_sampled_layers); } - sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors()); + sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors(), + use_timer_); // after this, this graph is a subgraph if (!use_view) { From 1933cbf77f9f2a9b609435caf36255e3e3ab258d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 6 Jul 2021 21:44:23 -0500 Subject: [PATCH 581/660] Added one timer for all GNN backward syncs Max across hosts stat only works if the max being used is all from one host; the current setup sums 3 different sync times that can be from 3 different hosts which is incorrect. Therefore, wrap all backward sync things in a single timer. 
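As a rough standalone sketch of the pattern (the three placeholder functions below stand in for the existing backward aggregate sync and the two weight-gradient syncs; the actual change only wraps those call sites in GNNGraph.cpp, GNNLayer.cpp, and SAGELayer.cpp):

    #include "galois/Timer.h"

    // Placeholders for the three existing backward-phase syncs; declared
    // only so the sketch is self-contained.
    void BackwardAggregateSync();
    void WeightGradientSyncSum();
    void WeightGradientSyncSum2();

    // One outer timer around every backward sync: the max-across-hosts
    // statistic then reflects a single host's total instead of summing
    // maxima that may come from three different hosts.
    void TimedBackwardSyncs() {
      galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon");
      clubbed_timer.start();
      BackwardAggregateSync();
      WeightGradientSyncSum();
      WeightGradientSyncSum2();
      clubbed_timer.stop();
    }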
--- libgnn/src/graphs/GNNGraph.cpp | 6 ++++++ libgnn/src/layers/GNNLayer.cpp | 3 +++ libgnn/src/layers/SAGELayer.cpp | 3 +++ 3 files changed, 12 insertions(+) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 36ef2ab58b..6f87f3e88e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -217,9 +217,12 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, Bitset_graph_aggregate>("Ignore"); } } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); sync_substrate_->sync( "BackwardGraphAggregateSync"); + clubbed_timer.stop(); } } else { // setup the SID to LID map for the sync substrate to use (SID != LID) @@ -234,9 +237,12 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, Bitset_graph_aggregate>("Ignore"); } } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); sync_substrate_->sync( "BackwardGraphAggregateSync"); + clubbed_timer.stop(); } } } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index e4f14d7408..885dc1f537 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -356,6 +356,8 @@ void galois::GNNLayer::ActivationDerivative( } void galois::GNNLayer::WeightGradientSyncSum() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + TimerStart(&clubbed_timer); galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_.size()); @@ -385,6 +387,7 @@ void galois::GNNLayer::WeightGradientSyncSum() { } #endif TimerStop(&t); + TimerStop(&clubbed_timer); } void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8962ec319a..e28bc2d0c3 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -222,6 +222,8 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, } void galois::SAGELayer::WeightGradientSyncSum2() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + TimerStart(&clubbed_timer); galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_2_.size()); @@ -250,6 +252,7 @@ void galois::SAGELayer::WeightGradientSyncSum2() { } #endif TimerStop(&t); + TimerStop(&clubbed_timer); } const galois::PointerWithSize galois::SAGELayer::ForwardPhase( From 8799bff2476b069db449a7c7abcdc29d708ff15b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Jul 2021 20:26:56 -0500 Subject: [PATCH 582/660] Optimize layer 0 backward for SAGE layer Layer 0 can avoid aggregation communication/compute completely if aggregation is done before update in that layer. 
Therefore, ignore the agg/update flip completely for that layer and always do aggregation followed by update (will result in more inefficient forward phase, but in sampling setting unless you have incredibly large HL size it won't matter because layer 0 backward aggregation is insanely expensive) --- libgnn/src/layers/SAGELayer.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index e28bc2d0c3..eae594d824 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -65,6 +65,12 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_in_temp_elements = layer_dimensions_.output_rows * layer_dimensions_.input_columns; + if (layer_number_ == 0) { + // set this to true for layer 0; it avoids aggregation completely + // in the last layer for the backward phase + config_.disable_aggregate_after_update = true; + } + // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { @@ -366,6 +372,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation // this is (FW)' + // TODO: this is absolutely terrible performance wise as well; keep + // in mind AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); } @@ -458,6 +466,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // 0 means input gradient shouldn't get masked if (layer_number_ != 0) { + // NOTE: this is super nice because it avoids aggregation completely + // in the layer 0 setting // ---unmasked--- // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() >= From 3ab8b2686c4458a619b0b092088779c8aa71b81c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Jul 2021 20:57:58 -0500 Subject: [PATCH 583/660] Undo last commit Tradeoff space needs better exploration --- libgnn/src/layers/SAGELayer.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index eae594d824..29f4719996 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -65,11 +65,15 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_in_temp_elements = layer_dimensions_.output_rows * layer_dimensions_.input_columns; - if (layer_number_ == 0) { - // set this to true for layer 0; it avoids aggregation completely - // in the last layer for the backward phase - config_.disable_aggregate_after_update = true; - } + // if (layer_number_ == 0) { + // // set this to true for layer 0; it avoids aggregation completely + // // in the last layer for the backward phase + // config_.disable_aggregate_after_update = true; + // // TODO this *will* hurt test evaluation because test eval has no + // // backward phase, so the end-to-end benefits do not exist there + // // Solution to this is to allocate all intermediate structures for both + // // cases + make sure resize handles both cases + // } // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || From 20c9ce891359fbb840475e8ad360754712050c19 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Jul 2021 15:33:17 -0500 Subject: [PATCH 
584/660] Sample flag sync can be readSource Sample flag setting is readAny before this commit: rationale I had back then was that even if a node didn't have outgoing edges it should still be included in the graph. This actually isn't necessary though because I remove dead mirrors later in subgraph construction, and it turns out that this sync call is actually a major part of subgraph construction. Therefore, to optimize runtime, go to readAny. This can cause problems if some vertex is activated later without the sync (i.e. no consistent "time" in which a vertex is entered), so in the sync call you have to make sure not to set anything that exceeds the current layer's number of rows (or you get a segfault). --- libgnn/include/galois/graphs/GNNGraph.h | 5 ++-- .../graphs/GraphAggregationSyncStructures.h | 29 +++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 20 +++++++------ libgnn/src/layers/SAGELayer.cpp | 8 ++++- 4 files changed, 35 insertions(+), 27 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 1d639a9cbd..36115929d7 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -554,7 +554,8 @@ class GNNGraph { // issues later void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { - AggregateSync(matrix_to_sync, matrix_column_size, false); + AggregateSync(matrix_to_sync, matrix_column_size, false, + std::numeric_limits::max()); }; //! Given a matrix and the column size, do an aggregate sync where each row @@ -563,7 +564,7 @@ class GNNGraph { //! Note that it's const because the only thing being used is the graph //! topology of this object; the thing modified is the passed in matrix void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - bool is_backward) const; + bool is_backward, uint32_t active_row_boundary) const; ////////////////////////////////////////////////////////////////////////////// // Sampling related diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 8cfb3c9a5d..50a07bdd4e 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -14,6 +14,7 @@ extern galois::DynamicBitSet bitset_graph_aggregate; extern galois::LargeArray* gnn_lid_to_sid_pointer_; extern galois::DynamicBitSet bitset_sample_flag_; extern size_t subgraph_size_; +extern size_t num_active_layer_rows_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -253,37 +254,35 @@ struct GNNSampleSumAggregate { //! No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} - //! 
element wise set + // version where you have a vector object static void setVal(uint32_t node_id, char&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); - if ((*gnn_lid_to_sid_pointer_)[node_id] == - std::numeric_limits::max()) { + uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; + if (converted_sid >= num_active_layer_rows_ || + converted_sid == std::numeric_limits::max()) { return; } - assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); + assert(converted_sid < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { - gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * - gnn_matrix_to_sync_column_length_ + + gnn_matrix_to_sync_[converted_sid * gnn_matrix_to_sync_column_length_ + i] = y[i]; } } + + // version where you have a pointer only (more efficient because this + // version is for reading directly from the recv buffer) static void setVal(uint32_t node_id, char&, ValTy::value_type* y) { - if ((*gnn_lid_to_sid_pointer_)[node_id] == - std::numeric_limits::max()) { + uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; + if (converted_sid >= num_active_layer_rows_ || + converted_sid == std::numeric_limits::max()) { return; } // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { - // galois::gPrint(galois::runtime::getSystemNetworkInterface().ID, "] - // broadxast nodeid ", node_id, " sid ", - // (*gnn_lid_to_sid_pointer_)[node_id], - // " write ", (*gnn_lid_to_sid_pointer_)[node_id] * - // gnn_matrix_to_sync_column_length_ + i, "\n"); - gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * - gnn_matrix_to_sync_column_length_ + + gnn_matrix_to_sync_[converted_sid * gnn_matrix_to_sync_column_length_ + i] = y[i]; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 6f87f3e88e..c12701d926 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -45,6 +45,7 @@ size_t subgraph_size_ = 0; //! 
For synchronization of graph aggregations galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; +size_t num_active_layer_rows_ = 0; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; @@ -200,12 +201,13 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( return (*mask_to_use)[lid]; } -void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, - const size_t matrix_column_size, - bool is_backward) const { +void galois::graphs::GNNGraph::AggregateSync( + GNNFloat* matrix_to_sync, const size_t matrix_column_size, bool is_backward, + uint32_t active_row_boundary) const { gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; subgraph_size_ = active_size(); + num_active_layer_rows_ = active_row_boundary; if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { @@ -971,11 +973,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // Seed nodes sync if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SeedNodeSample"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } galois::GAccumulator local_seed_count; @@ -1058,11 +1060,11 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } @@ -1155,11 +1157,11 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // correctly if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 29f4719996..636d7690b9 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -573,11 +573,15 @@ void galois::SAGELayer::AggregateAll( pts, bool is_backward) { std::string agg_timer_name = "AggregateCompute"; + size_t num_rows_to_handle; if (!is_backward) { agg_timer_name += "Forward"; + num_rows_to_handle = layer_dimensions_.output_rows; } else { agg_timer_name += "Backward"; + num_rows_to_handle = layer_dimensions_.input_rows; } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); TimerStart(&timer); @@ -597,8 +601,10 @@ void galois::SAGELayer::AggregateAll( AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, is_backward); TimerStop(&timer); + // aggregate sync - graph_.AggregateSync(aggregate_output, column_length, is_backward); + graph_.AggregateSync(aggregate_output, column_length, is_backward, + num_rows_to_handle); #ifdef GALOIS_ENABLE_GPU } #endif From 44c003dd6f19db03d6956bd89516c5c4b99c9e0b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Jul 2021 21:27:39 -0500 Subject: [PATCH 585/660] GCN barrier before start --- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index e3dd1cac77..f33fd89c38 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -13,6 +13,8 @@ int main(int argc, char* argv[]) { gnn->SetLayerPhases(galois::GNNPhase::kTrain); init_timer.stop(); + galois::runtime::getHostBarrier().wait(); + galois::StatTimer compute_timer("Timer_0"); compute_timer.start(); gnn->Train(num_epochs); From 14c6168ce12389c35f4e4381afadf624cdf6b196 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Jul 2021 21:29:07 -0500 
Subject: [PATCH 586/660] Parallel SID mapping for GNN subgraph SID mapping isn't slow if graph is sufficiently small, but it becomes a big problem once graphs scale up. This commit makes the process parallel using prefixsums. TODO need to double check it code wise for sanity. --- libgnn/include/galois/graphs/GNNSubgraph.h | 46 +++++ libgnn/src/graphs/GNNSubgraph.cpp | 205 ++++++++++++++++----- 2 files changed, 205 insertions(+), 46 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 2836e3f181..d9abd10c30 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -121,6 +121,45 @@ class GNNSubgraph { //! original graph. Should be done every epoch when sampled graph changes. void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t); + //! reset sid thread offsets used for parallel SID mapping creation + void ResetSIDThreadOffsets(size_t num_layers) { + if (!sid_thread_offsets_.size()) { + sid_thread_offsets_.resize(galois::getActiveThreads()); + galois::on_each([&](size_t thread_id, size_t) { + sid_thread_offsets_[thread_id].resize(num_layers); + }); + } + + if (!subgraph_mirror_offsets_.size()) { + subgraph_mirror_offsets_.resize(galois::getActiveThreads()); + galois::on_each([&](size_t thread_id, size_t) { + subgraph_mirror_offsets_[thread_id].resize( + galois::runtime::getSystemNetworkInterface().Num); + }); + } + + galois::do_all( + galois::iterate(size_t{0}, sid_thread_offsets_.size()), [&](size_t i) { + galois::PODResizeableArray& arr = sid_thread_offsets_[i]; + std::fill(arr.begin(), arr.end(), 0); + galois::PODResizeableArray& arr2 = + subgraph_mirror_offsets_[i]; + std::fill(arr2.begin(), arr2.end(), 0); + }); + + if (thread_zero_work_.size() < num_layers) { + thread_zero_work_.resize(num_layers); + } + if (thread_zero_mirror_offsets_.size() < + galois::runtime::getSystemNetworkInterface().Num) { + thread_zero_mirror_offsets_.resize( + galois::runtime::getSystemNetworkInterface().Num); + } + std::fill(thread_zero_work_.begin(), thread_zero_work_.end(), 0); + std::fill(thread_zero_mirror_offsets_.begin(), + thread_zero_mirror_offsets_.end(), 0); + } + //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); //! Creates edges @@ -159,5 +198,12 @@ class GNNSubgraph { galois::PODResizeableArray in_subedge_to_original_edge_; //! Mirror mappings for Gluon for subgraph + // std::vector> subgraph_mirrors_; std::vector> subgraph_mirrors_; + + //! 
Offsets to use for + std::vector> sid_thread_offsets_; + std::vector> subgraph_mirror_offsets_; + galois::PODResizeableArray thread_zero_work_; + galois::PODResizeableArray thread_zero_mirror_offsets_; }; diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 141390e20e..a19d1d1320 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -7,7 +7,6 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( TimerStart(&timer); for (auto& vec : subgraph_mirrors_) { vec.clear(); - // vec.reserve(num_subgraph_nodes_ - subgraph_master_boundary_); } CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { @@ -43,23 +42,30 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); - std::vector& master_offsets = gnn_graph.GetMasterOffsets(); - std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - galois::GAccumulator subgraph_count; subgraph_count.reset(); galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), [&](uint32_t node_id) { - // if (gnn_graph.IsInSampledGraph(node_id)) { if (gnn_graph.IsActiveInSubgraph(node_id)) { subgraph_count += 1; } }); num_subgraph_nodes_ = subgraph_count.reduce(); + // if no subgraph, get out + if (num_subgraph_nodes_ == 0) { + subgraph_master_boundary_ = 0; + TimerStop(&timer); + return; + } + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { + // allocate a bit more than necessary to avoid a big realloc + // if node value changes slightly later subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); } + // bitset to mark if a master is outside the "master only" boundary + // and not contiguous; needed to mask out non-masters galois::DynamicBitSet& non_layer_zero_masters = gnn_graph.GetNonLayerZeroMasters(); // init the bitset as necessary @@ -69,8 +75,12 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( non_layer_zero_masters.reset(); } + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); + + ResetSIDThreadOffsets(master_offsets.size()); + // compute offsets for each layer - uint32_t layer_zero_offset = 0; galois::PODResizeableArray layer_offsets; layer_offsets.resize(master_offsets.size() - 1); for (unsigned i = 0; i < layer_offsets.size(); i++) { @@ -81,59 +91,162 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( } } - // split into 2 parts: masters, then everything else + // all nodes before this SID are master nodes in layer 0; + // NOTE: there are master nodes past this boundary that will + // not be covered by a begin_owned loop, which may cause problems down + // the line; this is handled by the bitset above + subgraph_master_boundary_ = master_offsets[0]; + size_t last_owned_node = *(gnn_graph.end_owned()); - for (size_t local_node_id = 0; local_node_id < last_owned_node; - local_node_id++) { - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; - // master that won't be in prefix needs to be marked - non_layer_zero_masters.set(sid_to_use); + // compute amount of work each thread needs to do + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; + unsigned 
end_node; + // this thread always has a set number of nodes to run; this is it + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + // these arrays track how much work will need to be done by this + // thread + galois::PODResizeableArray& my_offsets = + sid_thread_offsets_[thread_id]; + galois::PODResizeableArray& my_mirror_offsets = + subgraph_mirror_offsets_[thread_id]; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + // only bother if node was active + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + // TODO(loc) this check shouldn't even be necessary; active in subgraph + // implies added at somepoint + if (node_timestamp != std::numeric_limits::max()) { + // tracks how many nodes for each timestamp this node will + // work with by incrementing this + my_offsets[node_timestamp]++; + + if (local_node_id >= last_owned_node) { + // this is a mirror node; get the host that the master is located + // on and increment this thread's mirror node count for that host + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + } } else { - sid_to_use = layer_zero_offset++; + GALOIS_LOG_WARN("shouldn't ever get here right?"); } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } + }); + + // prefix sum the threads + galois::do_all(galois::iterate(size_t{0}, master_offsets.size()), + [&](size_t layer_num) { + for (size_t thread_id = 1; + thread_id < galois::getActiveThreads(); thread_id++) { + sid_thread_offsets_[thread_id][layer_num] += + sid_thread_offsets_[thread_id - 1][layer_num]; + } + }); + + for (unsigned i = 0; i < master_offsets.size() - 1; i++) { + if (i > 0) { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] + + layer_offsets[i - 1] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } else { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } } - // all nodes before this SID are master nodes in layer 0; - // NOTE: there are master nodes past this boundary that will - // not be covered by a begin_owned loop, which may cause problems down - // the line - subgraph_master_boundary_ = master_offsets[0]; + // last element of prefix sum needs to equal the correct layer offset + galois::do_all( + galois::iterate(uint32_t{0}, + galois::runtime::getSystemNetworkInterface().Num), + [&](size_t host_num) { + // for each host, get prefix sum of each thread's mirrors + for (size_t thread_id = 1; thread_id < galois::getActiveThreads(); + thread_id++) { + subgraph_mirror_offsets_[thread_id][host_num] += + subgraph_mirror_offsets_[thread_id - 1][host_num]; + } + }); + + // allocate the mirror space; last element of prefix sum is total size + for (unsigned host_num = 0; + host_num < galois::runtime::getSystemNetworkInterface().Num; + host_num++) { + if (galois::runtime::getSystemNetworkInterface().ID == host_num) { + continue; + } + subgraph_mirrors_[host_num].resize( + subgraph_mirror_offsets_[galois::getActiveThreads() - 1][host_num]); + } + + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; 
+ unsigned end_node; + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + + galois::PODResizeableArray& current_thread_offset = + thread_id != 0 ? sid_thread_offsets_[thread_id - 1] : thread_zero_work_; + galois::PODResizeableArray& my_mirror_offsets = + thread_id != 0 ? subgraph_mirror_offsets_[thread_id - 1] + : thread_zero_mirror_offsets_; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1] + + current_thread_offset[node_timestamp]++; + if (local_node_id < last_owned_node) { + // master node that is not in layer 0 (i.e. node_timestamp != 0) + non_layer_zero_masters.set(sid_to_use); + } + } else { + // node timestamp == 0; no layer offset needed because offset + // is 0 + sid_to_use = current_thread_offset[node_timestamp]++; + } - // everything else; none of these are master nodes - for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); - local_node_id++) { - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; + // this is a mirror + if (local_node_id >= last_owned_node) { + // XXX(loc) mirror offsets + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + size_t my_offset = + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + + if (my_offset > + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()) + GALOIS_LOG_FATAL( + "{} {}", my_offset, + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()); + + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)][my_offset] = + node_gid; + } + + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use; } else { - sid_to_use = layer_zero_offset++; + GALOIS_LOG_WARN("shouldn't ever get here right?"); } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; - - uint32_t node_gid = gnn_graph.GetGID(local_node_id); - // mirror node; gids because they need to be sent as gids - // and converted over later - assert(node_gid < gnn_graph.global_size()); - assert(subgraph_mirrors_.size() > gnn_graph.GetHostID(node_gid)); - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].push_back(node_gid); } } - } + }); - GALOIS_LOG_ASSERT(layer_offsets.back() == num_subgraph_nodes_); TimerStop(&timer); } From 7546f211f0d14834f86ef1927edcfd19b91e7d15 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Jul 2021 20:17:36 -0500 Subject: [PATCH 587/660] Minibatch shuffling between epochs First half of ogbn paper fix: minibatches need to be shuffled between epochs to prevent overfitting to the same node minibatches in every epoch. The second half is input related: my remapping of the graph seems to have broken something as using the original graph results in much better accuracy. For the time being I will use that one. 
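The shuffling scheme itself is small; a simplified standalone sketch of what the MinibatchGenerator changes below do (the struct and method names here are illustrative, not the real class):

    #include <algorithm>
    #include <cstdint>
    #include <random>
    #include <vector>

    // Collect the ids set in the training mask once, reshuffle them at every
    // epoch reset, and hand out the next batch_size ids per minibatch.
    struct ShuffledMinibatcher {
      std::vector<uint32_t> ids; // positions set in the train mask
      size_t pos = 0;
      std::mt19937 rng{100};     // fixed seed so every host draws the same order

      void ResetEpoch() {
        pos = 0;
        std::shuffle(ids.begin(), ids.end(), rng);
      }

      // Marks the next minibatch in mask; returns how many nodes were taken.
      size_t NextMinibatch(std::vector<char>& mask, size_t batch_size) {
        std::fill(mask.begin(), mask.end(), 0);
        size_t taken = 0;
        while (pos < ids.size() && taken < batch_size) {
          mask[ids[pos++]] = 1;
          ++taken;
        }
        return taken;
      }
    };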
--- libgnn/include/galois/MinibatchGenerator.h | 54 ++++++++++++++++++++-- libgnn/include/galois/graphs/GNNGraph.h | 1 + libgnn/src/MinibatchGenerator.cpp | 14 +++++- 3 files changed, 64 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 8a5063ed1d..459014f65a 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -2,6 +2,8 @@ #include "galois/GNNTypes.h" #include "galois/Logging.h" +#include +#include namespace galois { @@ -12,20 +14,64 @@ class MinibatchGenerator { MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, size_t master_bound) : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, - current_position_{0}, master_bound_{master_bound} { + current_position_{0}, master_bound_{master_bound}, rand_generator_{ + 100} { GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } - void GetNextMinibatch(std::vector* batch_mask); + + void GetNextMinibatch(std::vector* batch_mask) { + if (!shuffle_mode_) { + OriginalGetNextMinibatch(batch_mask); + } else { + ShuffleGetNextMinibatch(batch_mask); + } + } + //! True if no more minibatches from this generator - bool NoMoreMinibatches() { return current_position_ == master_bound_; } + bool NoMoreMinibatches() { + if (!shuffle_mode_) { + return current_position_ == master_bound_; + } else { + return current_position_ >= all_indices_.size(); + } + } + //! Reset the only state (a position bit) - void ResetMinibatchState() { current_position_ = 0; } + void ResetMinibatchState() { + current_position_ = 0; + if (shuffle_mode_) { + std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + } + } + + void ShuffleMode() { + if (!shuffle_mode_) { + shuffle_mode_ = true; + all_indices_.reserve(master_bound_); + // setup all set indices for the minibatch + for (size_t pos = 0; pos < master_bound_; pos++) { + if (mask_to_minibatch_[pos]) { + all_indices_.emplace_back(pos); + } + } + // shuffle it + std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + printf("Number of things in minibatch generator is %lu\n", + all_indices_.size()); + } + } private: const GNNMask& mask_to_minibatch_; size_t minibatch_size_; size_t current_position_; size_t master_bound_; + std::vector all_indices_; + bool shuffle_mode_ = false; + std::mt19937 rand_generator_; + + void OriginalGetNextMinibatch(std::vector* batch_mask); + void ShuffleGetNextMinibatch(std::vector* batch_mask); }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 36115929d7..0dc906c772 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -397,6 +397,7 @@ class GNNGraph { } train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); + train_batcher_->ShuffleMode(); local_minibatch_mask_.resize(partitioned_graph_->size()); } diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 48570c094e..a0c66f703b 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -1,7 +1,7 @@ #include "galois/MinibatchGenerator.h" #include -void galois::MinibatchGenerator::GetNextMinibatch( +void galois::MinibatchGenerator::OriginalGetNextMinibatch( std::vector* batch_mask) { assert(current_position_ <= mask_to_minibatch_.size()); assert(current_position_ <= master_bound_); @@ -33,3 +33,15 @@ void 
galois::MinibatchGenerator::GetNextMinibatch( current_position_++; } } + +void galois::MinibatchGenerator::ShuffleGetNextMinibatch( + std::vector* batch_mask) { + size_t current_count = 0; + std::fill(batch_mask->begin(), batch_mask->end(), 0); + while (current_position_ < all_indices_.size()) { + (*batch_mask)[all_indices_[current_position_++]] = 1; + current_count++; + if (current_count == minibatch_size_) + break; + } +} From b68fcae6f4a1e9e7b32119b49554000bafe0968f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Jul 2021 21:29:23 -0500 Subject: [PATCH 588/660] WIP: timing subgraph clears and beginning to fix --- libgnn/src/GraphNeuralNetwork.cpp | 8 +++++++- libgnn/src/graphs/GNNGraph.cpp | 24 +++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index b6c38963ed..898523aedf 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -236,7 +236,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } } - galois::gDebug("Minibatching Correct / Total ", correct, " ", total); + galois::gInfo("Minibatching Correct / Total ", correct, " ", total); if (choose_all_status) { graph_->EnableSubgraphChooseAll(); @@ -366,6 +366,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // create mini batch graphs and loop until minibatches on all hosts done while (true) { + galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); + galois::StatTimer sample_time("MinibatchSampling", kRegionName); galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); mb_timer.start(); @@ -374,15 +376,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches + prep_timer.start(); size_t seed_node_count = graph_->PrepareNextTrainMinibatch(); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is for batch is ", seed_node_count); + prep_timer.stop(); // last layer input size/output rows becomes seed node size // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, // seed_node_count); + sample_time.start(); // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); @@ -414,6 +419,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } + sample_time.stop(); // resize layer matrices CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c12701d926..edb6738bcc 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -932,7 +932,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } else { UnsetSampledNode(*x); } - }); + }, galois::loopname("InitialSeedSetting")); // unsets nodes set in previous iterations; for some reason they get // synchronized along with everything else even though bitset sample flag // should prevent it (that, or it's because they don't get sync'd that they @@ -941,10 +941,13 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { [&](const NodeIterator& x) { UnsetSampledNode(*x); }); // clear node timestamps + galois::StatTimer fill_time("ClearFillTime"); + fill_time.start(); std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), std::numeric_limits::max()); 
std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + fill_time.stop(); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { master_offset_accum_[i].reset(); @@ -954,7 +957,8 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // clear all sampled edges galois::do_all( galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), - [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }); + [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, + galois::loopname("ClearSampleEdges")); sampled_edges_.reset(); // reset all degrees @@ -962,8 +966,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { galois::do_all( galois::iterate(sampled_out_degrees_), [&](galois::LargeArray& array) { - std::fill(array.begin(), array.end(), 0); + memset(array.data(), 0, array.size() * sizeof(uint32_t)) + //std::fill(array.begin(), array.end(), 0); + //std::fill(array.begin(), array.end(), 0); }, + galois::loopname("ClearAllDegrees"), galois::chunk_size<1>()); } @@ -1001,7 +1008,8 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // 0 = seed node sample_node_timestamps_[*x] = 0; } - }); + }, + galois::loopname("SeedNodeOffsetCounting")); sample_master_offsets_[0] = master_offset.reduce(); sample_mirror_offsets_[0] = mirror_offset.reduce(); @@ -1204,6 +1212,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, "Ignore"); } + galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); + offsets_n_rows_time.start(); galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsActiveInSubgraph(*x)) { if (sample_node_timestamps_[*x] != std::numeric_limits::max()) { @@ -1219,7 +1229,9 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, "should have been timestamped at some point if active"); } } - }); + }, + galois::loopname("MasterMirrorOffset") + ); std::vector new_rows(master_offset_accum_.size()); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { @@ -1231,6 +1243,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, } } + offsets_n_rows_time.stop(); + if (!use_view) { subgraph_->BuildSubgraph(*this, num_sampled_layers); } else { From 44e7995c1d4ae04455a129c576d13b08755d2e01 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Jul 2021 22:38:37 -0500 Subject: [PATCH 589/660] Parallel bitset reset and parallel std::fill fill and reset operations for GNNs are a very big bottleneck for large graphs. This commit adds parallel reset for bitsets and std::fill and uses them in the GNN libraries to avoid resetting them in parallel. 
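Usage sketch for the two helpers this commit adds (DynamicBitSet::ParallelReset and galois::ParallelSTL::fill), shown on a standalone function rather than the actual GNNGraph members:

    #include "galois/DynamicBitset.h"
    #include "galois/ParallelSTL.h"
    #include <cstdint>
    #include <limits>
    #include <vector>

    // Clear per-epoch sampling state with the parallel helpers instead of
    // serial reset()/std::fill, which dominate subgraph setup on large graphs.
    void ClearEpochState(galois::DynamicBitSet& sample_flag,
                         std::vector<uint32_t>& timestamps) {
      sample_flag.ParallelReset(); // do_all-backed equivalent of reset()
      galois::ParallelSTL::fill(timestamps.begin(), timestamps.end(),
                                std::numeric_limits<uint32_t>::max());
    }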
--- libgalois/include/galois/DynamicBitset.h | 6 + libgalois/include/galois/PODResizeableArray.h | 2 +- libgalois/include/galois/ParallelSTL.h | 6 + libgalois/include/galois/TwoDVector.h | 10 +- libgnn/include/galois/graphs/GNNSubgraph.h | 7 +- libgnn/src/GraphNeuralNetwork.cpp | 4 +- libgnn/src/MinibatchGenerator.cpp | 5 +- libgnn/src/graphs/GNNGraph.cpp | 149 ++++++++++-------- libgnn/src/graphs/GNNSubgraph.cpp | 5 +- 9 files changed, 114 insertions(+), 80 deletions(-) diff --git a/libgalois/include/galois/DynamicBitset.h b/libgalois/include/galois/DynamicBitset.h index 6bb9c34864..e2035d018a 100644 --- a/libgalois/include/galois/DynamicBitset.h +++ b/libgalois/include/galois/DynamicBitset.h @@ -109,6 +109,12 @@ class DynamicBitSet { */ void reset() { std::fill(bitvec.begin(), bitvec.end(), 0); } + void ParallelReset() { + galois::do_all( + galois::iterate(bitvec), + [&](galois::CopyableAtomic& to_reset) { to_reset = 0; }); + } + /** * Unset a range of bits given an inclusive range * diff --git a/libgalois/include/galois/PODResizeableArray.h b/libgalois/include/galois/PODResizeableArray.h index dc1cabdb48..acff59c8e9 100644 --- a/libgalois/include/galois/PODResizeableArray.h +++ b/libgalois/include/galois/PODResizeableArray.h @@ -187,7 +187,7 @@ class PODResizeableArray { void insert(iterator GALOIS_USED_ONLY_IN_DEBUG(position), InputIterator first, InputIterator last) { assert(position == end()); - size_t to_add = last - first; + size_t to_add = last - first; if (to_add > 0) { size_t old_size = size_; resize(old_size + to_add); diff --git a/libgalois/include/galois/ParallelSTL.h b/libgalois/include/galois/ParallelSTL.h index c22858c84f..4158a6dc5c 100644 --- a/libgalois/include/galois/ParallelSTL.h +++ b/libgalois/include/galois/ParallelSTL.h @@ -377,6 +377,12 @@ OutputIt partial_sum(InputIt first, InputIt last, OutputIt d_first) { } } +template +void fill(ForwardIt first, ForwardIt last, const T& value) { + galois::do_all(galois::iterate(first, last), + [&](auto& iter) { iter = value; }); +} + } // end namespace ParallelSTL } // end namespace galois #endif diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h index 1af9fba505..396bb208af 100644 --- a/libgalois/include/galois/TwoDVector.h +++ b/libgalois/include/galois/TwoDVector.h @@ -1,7 +1,7 @@ #pragma once -#include "gstl.h" -#include "PODResizeableArray.h" +#include "gstl.h" +#include "PODResizeableArray.h" namespace galois { @@ -27,18 +27,18 @@ class TwoDVector { assert(to_copy == fixed_vector_size_); size_t array_index = index * fixed_vector_size_; std::memcpy((void*)(&(underlying_memory_[array_index])), - (void*)to_copy.data(), - sizeof(T) * fixed_vector_size_); + (void*)to_copy.data(), sizeof(T) * fixed_vector_size_); } PODResizeableArray& edit_data() { return underlying_memory_; } const PODResizeableArray& data() { return underlying_memory_; } void resize(size_t s) { underlying_memory_.resize(s); } size_t size() const { return underlying_memory_.size(); } + private: size_t num_elements_{0}; size_t fixed_vector_size_{0}; PODResizeableArray underlying_memory_; }; -} +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index d9abd10c30..c7692533ba 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -155,9 +155,10 @@ class GNNSubgraph { thread_zero_mirror_offsets_.resize( galois::runtime::getSystemNetworkInterface().Num); } - std::fill(thread_zero_work_.begin(), 
thread_zero_work_.end(), 0); - std::fill(thread_zero_mirror_offsets_.begin(), - thread_zero_mirror_offsets_.end(), 0); + galois::ParallelSTL::fill(thread_zero_work_.begin(), + thread_zero_work_.end(), 0); + galois::ParallelSTL::fill(thread_zero_mirror_offsets_.begin(), + thread_zero_mirror_offsets_.end(), 0); } //! Counts in and out degrees of all sampled nodes in the graph diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 898523aedf..39c8c03eb0 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -387,7 +387,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, // seed_node_count); - sample_time.start(); + sample_time.start(); // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); @@ -419,7 +419,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } - sample_time.stop(); + sample_time.stop(); // resize layer matrices CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index a0c66f703b..120a1e7533 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -1,4 +1,5 @@ #include "galois/MinibatchGenerator.h" +#include "galois/Galois.h" #include void galois::MinibatchGenerator::OriginalGetNextMinibatch( @@ -7,7 +8,7 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( assert(current_position_ <= master_bound_); assert(batch_mask->size() == mask_to_minibatch_.size()); - std::fill(batch_mask->begin(), batch_mask->end(), 0); + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); if (current_position_ >= master_bound_) { return; } @@ -37,7 +38,7 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( void galois::MinibatchGenerator::ShuffleGetNextMinibatch( std::vector* batch_mask) { size_t current_count = 0; - std::fill(batch_mask->begin(), batch_mask->end(), 0); + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); while (current_position_ < all_indices_.size()) { (*batch_mask)[all_indices_[current_position_++]] = 1; current_count++; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index edb6738bcc..78d975ceee 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -919,20 +919,24 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { DisableSubgraph(); - bitset_sample_flag_.resize(size()); - bitset_sample_flag_.reset(); - definitely_sampled_nodes_.reset(); - - galois::do_all(galois::iterate(begin_owned(), end_owned()), - [&](const NodeIterator& x) { - if (IsValidForPhase(*x, seed_phase)) { - SetSampledNode(*x); - bitset_sample_flag_.set(*x); - definitely_sampled_nodes_.set(*x); - } else { - UnsetSampledNode(*x); - } - }, galois::loopname("InitialSeedSetting")); + if (!bitset_sample_flag_.size()) { + bitset_sample_flag_.resize(size()); + } + bitset_sample_flag_.ParallelReset(); + definitely_sampled_nodes_.ParallelReset(); + + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const NodeIterator& x) { + if (IsValidForPhase(*x, seed_phase)) { + SetSampledNode(*x); + bitset_sample_flag_.set(*x); + definitely_sampled_nodes_.set(*x); + } else { + UnsetSampledNode(*x); + } + }, + 
galois::loopname("InitialSeedSetting")); // unsets nodes set in previous iterations; for some reason they get // synchronized along with everything else even though bitset sample flag // should prevent it (that, or it's because they don't get sync'd that they @@ -943,10 +947,13 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // clear node timestamps galois::StatTimer fill_time("ClearFillTime"); fill_time.start(); - std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), - std::numeric_limits::max()); - std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); - std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + galois::ParallelSTL::fill(sample_node_timestamps_.begin(), + sample_node_timestamps_.end(), + std::numeric_limits::max()); + galois::ParallelSTL::fill(sample_master_offsets_.begin(), + sample_master_offsets_.end(), 0); + galois::ParallelSTL::fill(sample_mirror_offsets_.begin(), + sample_mirror_offsets_.end(), 0); fill_time.stop(); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { @@ -955,26 +962,33 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } // clear all sampled edges - galois::do_all( - galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), - [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, - galois::loopname("ClearSampleEdges")); + galois::StatTimer ctime("ClearSampleEdges"); + ctime.start(); + for (galois::DynamicBitSet& edge_layer : edge_sample_status_) { + edge_layer.ParallelReset(); + } + ctime.stop(); + // galois::do_all( + // galois::iterate(edge_sample_status_.begin(), + // edge_sample_status_.end()), + // [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, + // galois::loopname("ClearSampleEdges")); + + sampled_edges_.ParallelReset(); - sampled_edges_.reset(); // reset all degrees if (!subgraph_choose_all_) { - galois::do_all( - galois::iterate(sampled_out_degrees_), - [&](galois::LargeArray& array) { - memset(array.data(), 0, array.size() * sizeof(uint32_t)) - //std::fill(array.begin(), array.end(), 0); - //std::fill(array.begin(), array.end(), 0); - }, - galois::loopname("ClearAllDegrees"), - galois::chunk_size<1>()); + galois::StatTimer cad_timer("ClearAllDegrees"); + cad_timer.start(); + for (galois::LargeArray& array : sampled_out_degrees_) { + galois::ParallelSTL::fill(array.begin(), array.end(), 0); + } + cad_timer.stop(); } - bitset_sampled_degrees_.resize(partitioned_graph_->size()); + if (!bitset_sampled_degrees_.size()) { + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + } bitset_sampled_degrees_.reset(); // Seed nodes sync @@ -987,6 +1001,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { ->sync( "Ignore"); } + galois::GAccumulator local_seed_count; local_seed_count.reset(); galois::GAccumulator master_offset; @@ -994,22 +1009,24 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { galois::GAccumulator mirror_offset; mirror_offset.reset(); // count # of seed nodes - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } - // 
galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); - local_seed_count += 1; - // 0 = seed node - sample_node_timestamps_[*x] = 0; - } - }, - galois::loopname("SeedNodeOffsetCounting")); + // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); + local_seed_count += 1; + // 0 = seed node + sample_node_timestamps_[*x] = 0; + } + }, + galois::loopname("SeedNodeOffsetCounting")); sample_master_offsets_[0] = master_offset.reduce(); sample_mirror_offsets_[0] = mirror_offset.reduce(); @@ -1214,24 +1231,26 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); offsets_n_rows_time.start(); - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsActiveInSubgraph(*x)) { - if (sample_node_timestamps_[*x] != std::numeric_limits::max()) { - if (*x < *end_owned()) { - // master - master_offset_accum_[sample_node_timestamps_[*x]] += 1; - } else { - // mirror - mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsActiveInSubgraph(*x)) { + if (sample_node_timestamps_[*x] != + std::numeric_limits::max()) { + if (*x < *end_owned()) { + // master + master_offset_accum_[sample_node_timestamps_[*x]] += 1; + } else { + // mirror + mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + } + } else { + GALOIS_LOG_FATAL( + "should have been timestamped at some point if active"); + } } - } else { - GALOIS_LOG_FATAL( - "should have been timestamped at some point if active"); - } - } - }, - galois::loopname("MasterMirrorOffset") - ); + }, + galois::loopname("MasterMirrorOffset")); std::vector new_rows(master_offset_accum_.size()); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index a19d1d1320..720ff95413 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -39,8 +39,9 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( assert(gnn_graph.size() == lid_to_subgraph_id_.size()); // clear all mappings - std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), - std::numeric_limits::max()); + galois::ParallelSTL::fill(lid_to_subgraph_id_.begin(), + lid_to_subgraph_id_.end(), + std::numeric_limits::max()); galois::GAccumulator subgraph_count; subgraph_count.reset(); From 5648fd935ff2f1f71137543be095f248ebc46f44 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 20 Jul 2021 11:29:13 -0500 Subject: [PATCH 590/660] ogbnpapers remapping program Called "remap verify" but it doesn't actually verify; it remaps the original ogbn-papers using the existing binary remapping file on cdgc servers. 
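For orientation, the program added below is essentially a CSR rebuild under the new node numbering: count each remapped node's out-degree, prefix-sum those counts into the node index array, then copy every node's destinations (translated through the old-to-new map) into its slot before writing the header, index, and destination arrays to disk. A minimal serial sketch of that idea with hypothetical names (the real code below runs these loops with galois::do_all and asserts the mapping is a bijection):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // orig[u] holds the out-neighbors of old node u; new_to_old / old_to_new are
    // the remapping and its inverse. Produces the CSR arrays under the new IDs.
    void RemapToCSR(const std::vector<std::vector<uint32_t>>& orig,
                    const std::vector<uint32_t>& new_to_old,
                    const std::vector<uint32_t>& old_to_new,
                    std::vector<uint64_t>& index, std::vector<uint32_t>& dests) {
      std::size_t n = new_to_old.size();
      index.assign(n, 0);
      for (std::size_t v = 0; v < n; ++v)
        index[v] = orig[new_to_old[v]].size();   // out-degree of remapped node v
      for (std::size_t v = 1; v < n; ++v)
        index[v] += index[v - 1];                // prefix sum -> exclusive end offsets
      dests.assign(index.empty() ? 0 : index.back(), 0);
      for (std::size_t v = 0; v < n; ++v) {
        uint64_t pos = (v == 0) ? 0 : index[v - 1];
        for (uint32_t old_dst : orig[new_to_old[v]])
          dests[pos++] = old_to_new[old_dst];    // translate destinations as well
      }
    }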
--- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/remapverify.cpp | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 libgnn/test/remapverify.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 98b1d01e3e..e646259f87 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -23,6 +23,10 @@ add_executable(mkl_micro_dgalois mkl_micro.cpp) target_link_libraries(mkl_micro_dgalois galois_gnn) target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) +add_executable(remapverify remapverify.cpp) +target_link_libraries(remapverify galois_gnn) +target_compile_definitions(remapverify PUBLIC USE_DIST_GALOIS=1) + add_executable(mkl_micro_delete_galois mkl_micro.cpp) target_link_libraries(mkl_micro_delete_galois galois_gnn) target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) diff --git a/libgnn/test/remapverify.cpp b/libgnn/test/remapverify.cpp new file mode 100644 index 0000000000..169a0f129c --- /dev/null +++ b/libgnn/test/remapverify.cpp @@ -0,0 +1,104 @@ +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +// actually does remapping +int main() { + galois::DistMemSys G; + galois::graphs::LC_CSR_Graph orig; + orig.readGraphFromGRFile( + "/net/ohm/export/iss/inputs/Learning/ogbn-papers100M.tgr"); + // orig.readGraphFromGRFile("/net/ohm/export/iss/inputs/Learning/ogbn-papers100M.gr"); + + std::vector node_indices; + node_indices.resize(orig.size(), 0); + std::vector destinations; + destinations.resize(orig.sizeEdges(), 0); + + // get mapping + std::string remap_name = + galois::default_gnn_dataset_path + "ogbn-papers100M-remap-mapping.bin"; + std::ifstream file_stream; + file_stream.open(remap_name, std::ios::binary | std::ios::in); + std::vector new_to_old(111059956); + file_stream.read((char*)new_to_old.data(), + sizeof(uint32_t) * new_to_old.size()); + file_stream.close(); + + std::vector old_to_new(111059956); + + galois::DynamicBitSet mark_all; + mark_all.resize(orig.size()); + mark_all.reset(); + + // get # edges on each node in remapped + galois::do_all( + galois::iterate(orig.begin(), orig.end()), [&](uint32_t remapped_id) { + uint32_t source_id = new_to_old[remapped_id]; + old_to_new[source_id] = remapped_id; + mark_all.set(source_id); + GALOIS_LOG_ASSERT(source_id < orig.size()); + // TODO check duplicates too + node_indices[remapped_id] = + std::distance(orig.edge_begin(source_id), orig.edge_end(source_id)); + }); + + galois::do_all(galois::iterate(0, 111059956), + [&](unsigned i) { GALOIS_LOG_ASSERT(mark_all.test(i)); }); + + // prefix sum it + for (size_t i = 1; i < node_indices.size(); i++) { + node_indices[i] += node_indices[i - 1]; + } + // write all edges + galois::do_all( + galois::iterate(orig.begin(), orig.end()), + [&](uint32_t remapped_id) { + uint32_t source_id = new_to_old[remapped_id]; + GALOIS_LOG_ASSERT(source_id < orig.size()); + uint64_t current_idx; + if (remapped_id != 0) { + current_idx = node_indices[remapped_id - 1]; + } else { + current_idx = 0; + } + uint64_t my_end = node_indices[remapped_id]; + + for (auto ei = orig.edge_begin(source_id); + ei != orig.edge_end(source_id); ei++) { + uint32_t dest = old_to_new[orig.getEdgeDst(ei)]; + destinations[current_idx++] = dest; + } + GALOIS_LOG_ASSERT(current_idx == my_end); + // TODO check duplicates too + // node_indices[remapped_id] = std::distance(orig.edge_begin(node_id), + // orig.edge_end(node_id)); + }, + galois::steal()); + + // write everything + struct Header { + 
uint64_t version; + uint64_t size; + uint64_t numNodes; + uint64_t numEdges; + }; + Header h; + h.version = 1; + h.size = 0; + h.numNodes = orig.size(); + h.numEdges = orig.sizeEdges(); + + std::string filename = + "/net/ohm/export/iss/inputs/Learning/ogbn-papers100M-remap.tgr"; + // std::string filename = + // "/net/ohm/export/iss/inputs/Learning/ogbn-papers100M-remap.gr"; + std::ofstream write_stream; + write_stream.open(filename, std::ios::binary | std::ios::out); + write_stream.write((char*)&h, sizeof(Header)); + write_stream.write((char*)node_indices.data(), + sizeof(uint64_t) * node_indices.size()); + write_stream.write((char*)destinations.data(), + sizeof(uint32_t) * destinations.size()); + + write_stream.close(); +} From 0b8538b1d48cc0dc828487227ee4655ce23fd1e4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 21 Jul 2021 23:21:20 -0500 Subject: [PATCH 591/660] GNN softmax, sync, masking correctness changes 1) You cannot use readSource for sampled nodes: they need to be active across all hosts the moment that they become active on one host because all other hosts need to start calculating the gradients for them in the backward phase because they may be used later for gradient computation. (unless they're *completely* disconnected in a subgraph, but that is unknown until sampling is done; therefore, readAny must be used) 2) Softmax was using a "is valid for phase" check for nodes: this is wrong because hosts will need to compute the softmax value for nodes that aren't part of its own batch but part of another hosts' batch. 3) A matrix was being masked incorrectly by SAGE due to a faulty if condition: this broke the distributed execution accuracy curve. The if has been fixed. Performance wise, (1) and (2) will affect performance due to increased sync volume and more compute in the softmax layer. 
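A condensed view of fix (3), with stand-in lambdas for the real MaskInputNonMasters/MaskGradientNonMasters calls in the SAGE backward pass below: the old else branch masked the gradient whenever this spot skipped the input mask, which also hit non-zero layers running with concat enabled and broke the distributed accuracy curve; the corrected flow masks the input for every non-zero layer and masks the gradient only for layer 0, where there is no input to mask.

    #include <functional>

    // Hypothetical control-flow sketch only; the real methods act on layer matrices.
    void MaskForBackward(int layer_number, bool concat_disabled,
                         const std::function<void()>& mask_input,
                         const std::function<void()>& mask_gradient) {
      if (layer_number != 0 && concat_disabled)
        mask_input();      // non-zero layers: mask non-master rows of the input
      if (layer_number == 0)
        mask_gradient();   // layer 0 cannot mask its input, so mask the gradient instead
    }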
--- libgnn/include/galois/graphs/GNNGraph.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 37 +++++++++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 18 ++++-------- libgnn/src/layers/SAGELayer.cpp | 35 +++++++++++++++-------- libgnn/src/layers/SoftmaxLayer.cpp | 8 ------ 5 files changed, 53 insertions(+), 46 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 0dc906c772..e46e388bf1 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -79,6 +79,7 @@ class GNNGraph { } } + bool is_owned(size_t gid) const { return partitioned_graph_->isOwned(gid); } bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } size_t GetGID(size_t lid) const { return partitioned_graph_->getGID(lid); } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 39c8c03eb0..eca0e4022c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -295,23 +295,28 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); - size_t l_count = 0; - gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - (*back_iter) - ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], - subgraph_layer_sizes[l_count]); - l_count++; - } - } + // TODO(loc) this doesn't actually function as expected anymore + // with the numerous changes to the system; this commenting + // out is more of a hack for the train subgraph option (which + // probably shouldn't be used anyways) + + //size_t l_count = 0; + //gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + //for (auto back_iter = gnn_layers_.rbegin(); + // back_iter != gnn_layers_.rend(); back_iter++) { + // GNNLayerType layer_type = (*back_iter)->layer_type(); + // if (layer_type == GNNLayerType::kGraphConvolutional || + // layer_type == GNNLayerType::kSAGE) { + // (*back_iter) + // ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], + // subgraph_layer_sizes[l_count]); + // l_count++; + // } + //} CorrectBackwardLinks(); } - // beginning of epoch sampling + // beginning of epoch sampling (no minibatches) if (config_.do_sampling() && !config_.train_minibatch_size()) { galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); mb_timer.start(); @@ -398,6 +403,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // you can minibatch with sampling or minibatch and grab all // relevant neighbors size_t current_sample_size; + if (config_.do_sampling()) { current_sample_size = graph_->SampleEdges( (*back_iter)->graph_user_layer_number(), @@ -408,10 +414,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), config_.inductive_subgraph_, num_sampled_layers + 1); } + galois::gDebug(graph_->host_prefix(), "Number of local nodes for layer ", (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); + // resize this layer, change seed node count //(*back_iter) // ->ResizeInputOutputRows(current_sample_size, seed_node_count); @@ -424,6 +432,7 @@ float 
galois::GraphNeuralNetwork::Train(size_t num_epochs) { // resize layer matrices CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); + // XXX resizes above only work for SAGE layers; will break if other // layers are tested diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 78d975ceee..e616465d1b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -994,11 +994,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // Seed nodes sync if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SeedNodeSample"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } @@ -1068,10 +1068,6 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, }, galois::steal(), galois::loopname("ChooseAllEdges")); - // galois::gPrint("Num sampled edges in inductive graph is ", - // sampled.reduce(), - // " out of ", total.reduce(), "\n"); - // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( @@ -1085,11 +1081,11 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } @@ -1100,8 +1096,6 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - // galois::gInfo(host_prefix_, "Layer ", timestamp, " new node is ", - // GetGID(*x)); sample_node_timestamps_[*x] = timestamp; } } @@ -1182,11 +1176,11 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // correctly if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 636d7690b9..0354035958 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -155,16 +155,15 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { size_t num_in_temp_elements = new_output_rows * layer_dimensions_.input_columns; - galois::gDebug("Layer num ", layer_number_, " ", in_temp_1_.size(), " and ", - num_in_temp_elements, " ", layer_dimensions_.input_columns, - " ", layer_dimensions_.output_columns); + galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", + in_temp_1_.size(), " and ", num_in_temp_elements, " ", + layer_dimensions_.input_columns, " ", + layer_dimensions_.output_columns); // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - galois::gDebug("in first if"); if (in_temp_1_.size() < num_in_temp_elements) { - galois::gDebug("in the resize"); galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, ", SAGE input temp var 1 ", num_in_temp_elements, " (", FloatElementsToGB(num_in_temp_elements), " GB)"); @@ -237,6 +236,7 @@ void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_2_.size()); + #ifdef GALOIS_ENABLE_GPU bool gpu_direct_enabled = false; if (device_personality == DevicePersonality::GPU_CUDA && @@ -270,7 +270,8 @@ const galois::PointerWithSize 
galois::SAGELayer::ForwardPhase( galois::gDebug( "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns); + layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", + layer_dimensions_.input_rows * layer_dimensions_.input_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -304,6 +305,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_in_temp_1_.size() >= layer_dimensions_.output_rows * layer_dimensions_.input_columns); } + // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); @@ -313,9 +315,11 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( } else { assert(p_out_temp_.size() >= layer_dimensions_.input_rows * layer_dimensions_.output_columns); + // update to aggregate // FW UpdateEmbeddings(input_data, p_out_temp_.data(), false); + // A(FW) assert(p_forward_output_matrix_.size() >= layer_dimensions_.output_rows * layer_dimensions_.output_columns); @@ -383,7 +387,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } if (!sage_config_.disable_concat) { - // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { if (graph_.IsSubgraphOn()) { MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, @@ -422,6 +425,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( input_data.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); TimerStop(&concat_grad_timer); + #ifdef GALOIS_ENABLE_GPU } #endif @@ -443,8 +447,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); } } - // if concat is disabled, then input grad isn't masked; therefore, mask - // this to get the same effect #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -481,6 +483,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // weight matrix) UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), true); + // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -489,8 +492,9 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } } else { // --unmasked-- - // disable concat part is here because otherwise it would get done elsewhere - // XXX masking may not be required in sampling case where rows change + + // disable concat is part of condition because otherwise this mask + // should have gotten done elsewhere if (layer_number_ != 0 && sage_config_.disable_concat) { if (graph_.IsSubgraphOn()) { MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, @@ -498,7 +502,11 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); } - } else { + } + + // layer number 0 means output needs to be masked because input cannot + // be masked + if (layer_number_ == 0) { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients if (graph_.IsSubgraphOn()) { @@ -532,6 +540,9 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } #endif + // to get a correct result out temp mask cannot be masked; + // outtemp will only be masked if layer number is 0, so this + // is safe in all other cases if (layer_number_ != 
0) { // derivative for update // backout = F' diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 8b99db4073..70a6afa6c3 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -32,7 +32,6 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } } - if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], &p_backward_output_matrix_[feature_length * i]); @@ -53,10 +52,6 @@ galois::SoftmaxLayer::ForwardPhaseCPU( loss_accum += input_loss_[i]; handled += 1; #endif - } else { - VectorZero(feature_length, - &p_backward_output_matrix_[i * feature_length]); - } }, // TODO chunk size? // steal on as some threads may have nothing to work on @@ -94,10 +89,8 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( - // galois::iterate(graph_.begin(), graph_.end()), galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned node) { - if (graph_.IsValidForPhase(node, layer_phase_)) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraphSubgraph(node)) @@ -121,7 +114,6 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { p_backward_output_matrix_[node * feature_length + idx]; } } - } }, galois::steal(), galois::loopname("SoftmaxBackward")); From 1f0dbf6aaee3cc9197aacc700b82f33800129b2e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 22 Jul 2021 15:09:09 -0500 Subject: [PATCH 592/660] GNN: time-based randomness; parreset in subgraph 1) Parallel reset for a bitset in subgraph construction. 2) RNG for minibatcher now determined by time when called. --- libgnn/include/galois/MinibatchGenerator.h | 13 ++++++++----- libgnn/src/graphs/GNNSubgraph.cpp | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 459014f65a..73a65180d0 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -2,6 +2,7 @@ #include "galois/GNNTypes.h" #include "galois/Logging.h" +#include #include #include @@ -14,8 +15,10 @@ class MinibatchGenerator { MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, size_t master_bound) : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, - current_position_{0}, master_bound_{master_bound}, rand_generator_{ - 100} { + current_position_{0}, master_bound_{master_bound} { + // set seed based on time then initialize random generate with rand() + srand(time(NULL)); + rand_generator_ = std::make_unique(rand()); GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } @@ -40,7 +43,7 @@ class MinibatchGenerator { void ResetMinibatchState() { current_position_ = 0; if (shuffle_mode_) { - std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); } } @@ -55,7 +58,7 @@ class MinibatchGenerator { } } // shuffle it - std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); printf("Number of things in minibatch generator is %lu\n", all_indices_.size()); } @@ -68,7 +71,7 @@ class MinibatchGenerator { size_t master_bound_; std::vector all_indices_; bool shuffle_mode_ = false; - std::mt19937 rand_generator_; + std::unique_ptr rand_generator_; void OriginalGetNextMinibatch(std::vector* 
batch_mask); void ShuffleGetNextMinibatch(std::vector* batch_mask); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 720ff95413..5e95b079fd 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -73,7 +73,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( if (non_layer_zero_masters.size() < num_subgraph_nodes_) { non_layer_zero_masters.resize(num_subgraph_nodes_); } else { - non_layer_zero_masters.reset(); + non_layer_zero_masters.ParallelReset(); } std::vector& master_offsets = gnn_graph.GetMasterOffsets(); From a7477e8fc26120ace304c10d380c51f17c8d8658 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 23 Jul 2021 19:14:32 -0500 Subject: [PATCH 593/660] GNN distributed minibatch shuffler As # of hosts grow in distributed setting, it becomes more difficult to sample certain kinds of minibatch distributions because in the old scheme before this commit you always had to pick a fixed number on each host: this commit changes that up and does some sync to allow each host to pick a different number from each other to shuffle up the distribution more like single host. --- libgnn/CMakeLists.txt | 1 + .../galois/DistributedMinibatchTracker.h | 63 +++++++++++++++++++ libgnn/include/galois/GraphNeuralNetwork.h | 4 ++ libgnn/include/galois/MinibatchGenerator.h | 21 +++++++ libgnn/include/galois/graphs/GNNGraph.h | 7 ++- libgnn/src/DistributedMinibatchTracker.cpp | 57 +++++++++++++++++ libgnn/src/GraphNeuralNetwork.cpp | 36 +++++++++-- libgnn/src/MinibatchGenerator.cpp | 12 ++++ libgnn/src/layers/SAGELayer.cpp | 26 ++++---- 9 files changed, 208 insertions(+), 19 deletions(-) create mode 100644 libgnn/include/galois/DistributedMinibatchTracker.h create mode 100644 libgnn/src/DistributedMinibatchTracker.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 5bf32581d7..22a18c7fdf 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,4 +1,5 @@ set(sources + src/DistributedMinibatchTracker.cpp src/GNNMath.cpp src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp diff --git a/libgnn/include/galois/DistributedMinibatchTracker.h b/libgnn/include/galois/DistributedMinibatchTracker.h new file mode 100644 index 0000000000..1469db0e62 --- /dev/null +++ b/libgnn/include/galois/DistributedMinibatchTracker.h @@ -0,0 +1,63 @@ +#pragma once + +#include "galois/graphs/GNNGraph.h" +#include + +namespace galois { + +//! Tracks how many nodes remain to be chosen from every host's +//! minibatch and also determines how many to pull from this +//! particular host every iteration. +class DistributedMinibatchTracker { +public: + DistributedMinibatchTracker(size_t my_host_id, size_t num_hosts, + size_t my_minibatch_nodes, + size_t total_minibatch_size) + : my_host_id_{my_host_id}, num_hosts_{num_hosts}, + total_minibatch_size_{total_minibatch_size}, complete_hosts_{0}, + rng_object_{(long unsigned)rand() * (my_host_id_ + 1)}, + int_distribution_{0, (unsigned)num_hosts_ - 1} { + max_num_on_hosts_.resize(num_hosts_, 0); + current_num_on_hosts_.resize(num_hosts_, 0); + sampled_num_on_hosts_.resize(num_hosts_, 0); + max_num_on_hosts_[my_host_id_] = my_minibatch_nodes; + + // all reduce so all get the right values + // TODO technically all reduce would be sending unnecessary 0s + // but whatever this is relatively small + MPI_Allreduce(MPI_IN_PLACE, static_cast(max_num_on_hosts_.data()), + num_hosts_, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD); + } + + //! 
Reset epoch = set all current sampled back to initial state + void ResetEpoch() { + galois::do_all( + galois::iterate(size_t{0}, num_hosts_), [&](size_t host_id_) { + current_num_on_hosts_[host_id_] = max_num_on_hosts_[host_id_]; + }); + complete_hosts_ = 0; + } + + size_t GetNumberForNextMinibatch(); + + bool OutOfWork() { return complete_hosts_ == num_hosts_; } + +private: + size_t my_host_id_; + size_t num_hosts_; + size_t total_minibatch_size_; + unsigned complete_hosts_; + + std::mt19937 rng_object_; + std::uniform_int_distribution int_distribution_; + //! Maximum amount of nodes on each host; used to reset state + std::vector max_num_on_hosts_; + //! Current number of nodes left on each host; used to know how + //! to sample on each host + std::vector current_num_on_hosts_; + //! Vector to be sync'd indicating how many to grab from each + //! batch + std::vector sampled_num_on_hosts_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index ff13e24c41..a813378116 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -8,6 +8,7 @@ #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" +#include "galois/DistributedMinibatchTracker.h" #ifdef GALOIS_ENABLE_GPU #include "galois/GraphNeuralNetwork.cuh" @@ -265,6 +266,9 @@ class GraphNeuralNetwork { //! Termination detection for minibatching galois::DGAccumulator work_left_; + size_t num_hosts_{0}; + std::unique_ptr dist_minibatch_tracker_; + #ifdef GALOIS_ENABLE_GPU //! Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 73a65180d0..7e939c9cf4 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -30,6 +30,15 @@ class MinibatchGenerator { } } + void GetNextMinibatch(std::vector* batch_mask, size_t num_to_get) { + if (!shuffle_mode_) { + // TODO + GALOIS_LOG_FATAL("not yet implemented"); + } else { + ShuffleGetNextMinibatch(batch_mask, num_to_get); + } + } + //! True if no more minibatches from this generator bool NoMoreMinibatches() { if (!shuffle_mode_) { @@ -64,6 +73,16 @@ class MinibatchGenerator { } } + //! Total number of nodes that can be minibatched by this minibatch + //! 
generator on this host + size_t ShuffleMinibatchTotal() { + if (shuffle_mode_) { + return all_indices_.size(); + } else { + return 0; + } + } + private: const GNNMask& mask_to_minibatch_; size_t minibatch_size_; @@ -75,6 +94,8 @@ class MinibatchGenerator { void OriginalGetNextMinibatch(std::vector* batch_mask); void ShuffleGetNextMinibatch(std::vector* batch_mask); + void ShuffleGetNextMinibatch(std::vector* batch_mask, + size_t num_to_get); }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index e46e388bf1..044f82e7a2 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -391,7 +391,7 @@ class GNNGraph { //} ////////////////////////////////////////////////////////////////////////////// - void SetupTrainBatcher(size_t train_batch_size) { + size_t SetupTrainBatcher(size_t train_batch_size) { if (train_batcher_) { // clear before remake train_batcher_.reset(); @@ -400,6 +400,7 @@ class GNNGraph { local_training_mask_, train_batch_size, *end_owned()); train_batcher_->ShuffleMode(); local_minibatch_mask_.resize(partitioned_graph_->size()); + return train_batcher_->ShuffleMinibatchTotal(); } void ResetTrainMinibatcher() { train_batcher_->ResetMinibatchState(); } @@ -407,6 +408,10 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes size_t PrepareNextTrainMinibatch(); + size_t PrepareNextTrainMinibatch(size_t num_to_get) { + train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); + return SetupNeighborhoodSample(GNNPhase::kBatch); + } //! Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; diff --git a/libgnn/src/DistributedMinibatchTracker.cpp b/libgnn/src/DistributedMinibatchTracker.cpp new file mode 100644 index 0000000000..609030ae23 --- /dev/null +++ b/libgnn/src/DistributedMinibatchTracker.cpp @@ -0,0 +1,57 @@ +#include +#include "galois/DistributedMinibatchTracker.h" + +size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { + galois::StatTimer timer("DistributedGetNumberForNextMinibatch"); + timer.start(); + + // TODO + for (size_t i = 0; i < total_minibatch_size_; i++) { + // pick a host, increment + unsigned chosen_host = int_distribution_(rng_object_); + assert(chosen_host < num_hosts_); + sampled_num_on_hosts_[chosen_host]++; + } + // sync and post process *the same way on all hosts* + MPI_Allreduce(MPI_IN_PLACE, static_cast(sampled_num_on_hosts_.data()), + num_hosts_, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD); + + size_t to_return = 0; + uint32_t leftover_to_allocate = 0; + + // TODO parallel? 
+ for (size_t i = 0; i < num_hosts_; i++) { + uint32_t proposed_to_sample = sampled_num_on_hosts_[i]; + size_t left_to_sample = current_num_on_hosts_[i]; + size_t actual_to_sample = 0; + if (left_to_sample > 0) { + actual_to_sample = std::min(proposed_to_sample, current_num_on_hosts_[i]); + + if (actual_to_sample < left_to_sample && leftover_to_allocate) { + // more left to sample and we have extra; dump more from extra if + // possible + uint32_t what_is_left = left_to_sample - actual_to_sample; + size_t more_to_sample = std::min(what_is_left, leftover_to_allocate); + leftover_to_allocate -= more_to_sample; + actual_to_sample += more_to_sample; + assert(actual_to_sample <= left_to_sample); + } + } + leftover_to_allocate = proposed_to_sample - actual_to_sample; + current_num_on_hosts_[i] -= actual_to_sample; + + sampled_num_on_hosts_[i] = 0; + if (my_host_id_ == i) { + to_return = actual_to_sample; + } + } + timer.stop(); + + if (leftover_to_allocate) { + // if there are leftovers, it means that there is no more work + // in this system period + complete_hosts_ = num_hosts_; + } + + return to_return; +} diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index eca0e4022c..feddc1fb2c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -122,9 +122,17 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( config_.use_train_subgraph_); } + num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; if (config_.train_minibatch_size()) { - graph_->SetupTrainBatcher(config_.train_minibatch_size()); + size_t local_num = + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + if (num_hosts_ > 1) { + dist_minibatch_tracker_ = std::make_unique( + galois::runtime::getSystemNetworkInterface().ID, num_hosts_, + local_num, config_.train_minibatch_size()); + } } + if (config_.test_minibatch_size()) { graph_->SetupTestBatcher(config_.test_minibatch_size()); } @@ -300,9 +308,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // out is more of a hack for the train subgraph option (which // probably shouldn't be used anyways) - //size_t l_count = 0; - //gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); - //for (auto back_iter = gnn_layers_.rbegin(); + // size_t l_count = 0; + // gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + // for (auto back_iter = gnn_layers_.rbegin(); // back_iter != gnn_layers_.rend(); back_iter++) { // GNNLayerType layer_type = (*back_iter)->layer_type(); // if (layer_type == GNNLayerType::kGraphConvolutional || @@ -365,6 +373,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GradientPropagation(); } else { graph_->ResetTrainMinibatcher(); + if (num_hosts_ > 1) { + dist_minibatch_tracker_->ResetEpoch(); + } + SetLayerPhases(galois::GNNPhase::kBatch); size_t batch_num = 0; @@ -382,7 +394,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches prep_timer.start(); - size_t seed_node_count = graph_->PrepareNextTrainMinibatch(); + size_t seed_node_count; + if (num_hosts_ > 1) { + size_t num_for_next_batch = + dist_minibatch_tracker_->GetNumberForNextMinibatch(); + galois::gInfo(graph_->host_prefix(), "Sampling ", num_for_next_batch, + " for this minibatch"); + seed_node_count = + graph_->PrepareNextTrainMinibatch(num_for_next_batch); + } else { + seed_node_count = graph_->PrepareNextTrainMinibatch(); + } + galois::gDebug(graph_->host_prefix(), "Number of 
local seed nodes is for batch is ", seed_node_count); @@ -503,6 +526,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.start(); if (!global_work_left) { + if (num_hosts_ > 1) { + GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); + } break; } } diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 120a1e7533..c1bb8c221d 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -46,3 +46,15 @@ void galois::MinibatchGenerator::ShuffleGetNextMinibatch( break; } } + +void galois::MinibatchGenerator::ShuffleGetNextMinibatch( + std::vector* batch_mask, size_t num_to_get) { + size_t current_count = 0; + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); + while (current_position_ < all_indices_.size()) { + (*batch_mask)[all_indices_[current_position_++]] = 1; + current_count++; + if (current_count == num_to_get) + break; + } +} diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 0354035958..f078d97bd9 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -155,10 +155,10 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { size_t num_in_temp_elements = new_output_rows * layer_dimensions_.input_columns; - galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", - in_temp_1_.size(), " and ", num_in_temp_elements, " ", - layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns); + //galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", + // in_temp_1_.size(), " and ", num_in_temp_elements, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || @@ -267,11 +267,11 @@ void galois::SAGELayer::WeightGradientSyncSum2() { const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { - galois::gDebug( - "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", - layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", - layer_dimensions_.input_rows * layer_dimensions_.input_columns); + //galois::gDebug( + // "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", + // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", + // layer_dimensions_.input_rows * layer_dimensions_.input_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -742,10 +742,10 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif - galois::gDebug("Layer ", graph_user_layer_number_, " ", - layer_dimensions_.output_rows, " ", - layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns); + //galois::gDebug("Layer ", graph_user_layer_number_, " ", + // layer_dimensions_.output_rows, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); // CPU version is just a call into CBlas if (after) { galois::CBlasSGEMM( From 1a0a94d57ccf5daa80627ac621bd4cc535b65822 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Jul 2021 15:09:38 -0500 Subject: [PATCH 594/660] Updated the distributed minibatch 
sampler 1) Uses normalized sampling to get more varied distributions across hosts 2) Fixes bug with original sampler where extra work from later hosts is not applied to hosts that come before it in the loop --- .../galois/DistributedMinibatchTracker.h | 16 ++- libgnn/src/DistributedMinibatchTracker.cpp | 109 +++++++++++------- libgnn/src/GraphNeuralNetwork.cpp | 6 +- 3 files changed, 83 insertions(+), 48 deletions(-) diff --git a/libgnn/include/galois/DistributedMinibatchTracker.h b/libgnn/include/galois/DistributedMinibatchTracker.h index 1469db0e62..be5496ba92 100644 --- a/libgnn/include/galois/DistributedMinibatchTracker.h +++ b/libgnn/include/galois/DistributedMinibatchTracker.h @@ -12,11 +12,13 @@ class DistributedMinibatchTracker { public: DistributedMinibatchTracker(size_t my_host_id, size_t num_hosts, size_t my_minibatch_nodes, - size_t total_minibatch_size) + size_t local_minibatch_size) : my_host_id_{my_host_id}, num_hosts_{num_hosts}, - total_minibatch_size_{total_minibatch_size}, complete_hosts_{0}, - rng_object_{(long unsigned)rand() * (my_host_id_ + 1)}, - int_distribution_{0, (unsigned)num_hosts_ - 1} { + local_minibatch_size_{local_minibatch_size}, + total_minibatch_size_{local_minibatch_size_ * num_hosts_}, + complete_hosts_{0}, rng_object_{(long unsigned)rand() * + (my_host_id_ + 1)}, + int_distribution_{1, 10} { max_num_on_hosts_.resize(num_hosts_, 0); current_num_on_hosts_.resize(num_hosts_, 0); sampled_num_on_hosts_.resize(num_hosts_, 0); @@ -40,11 +42,15 @@ class DistributedMinibatchTracker { size_t GetNumberForNextMinibatch(); - bool OutOfWork() { return complete_hosts_ == num_hosts_; } + bool OutOfWork() { + GALOIS_LOG_FATAL("NEED TO IMPLEMENT"); + return complete_hosts_ == num_hosts_; + } private: size_t my_host_id_; size_t num_hosts_; + size_t local_minibatch_size_; size_t total_minibatch_size_; unsigned complete_hosts_; diff --git a/libgnn/src/DistributedMinibatchTracker.cpp b/libgnn/src/DistributedMinibatchTracker.cpp index 609030ae23..dddbc33519 100644 --- a/libgnn/src/DistributedMinibatchTracker.cpp +++ b/libgnn/src/DistributedMinibatchTracker.cpp @@ -5,53 +5,82 @@ size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { galois::StatTimer timer("DistributedGetNumberForNextMinibatch"); timer.start(); - // TODO - for (size_t i = 0; i < total_minibatch_size_; i++) { - // pick a host, increment - unsigned chosen_host = int_distribution_(rng_object_); - assert(chosen_host < num_hosts_); - sampled_num_on_hosts_[chosen_host]++; + uint32_t my_share = int_distribution_(rng_object_); + if (current_num_on_hosts_[my_host_id_] == 0) { + my_share = 0; } + sampled_num_on_hosts_[my_host_id_] = my_share; // sync and post process *the same way on all hosts* - MPI_Allreduce(MPI_IN_PLACE, static_cast(sampled_num_on_hosts_.data()), - num_hosts_, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD); - - size_t to_return = 0; - uint32_t leftover_to_allocate = 0; - - // TODO parallel? 
- for (size_t i = 0; i < num_hosts_; i++) { - uint32_t proposed_to_sample = sampled_num_on_hosts_[i]; - size_t left_to_sample = current_num_on_hosts_[i]; - size_t actual_to_sample = 0; - if (left_to_sample > 0) { - actual_to_sample = std::min(proposed_to_sample, current_num_on_hosts_[i]); - - if (actual_to_sample < left_to_sample && leftover_to_allocate) { - // more left to sample and we have extra; dump more from extra if - // possible - uint32_t what_is_left = left_to_sample - actual_to_sample; - size_t more_to_sample = std::min(what_is_left, leftover_to_allocate); - leftover_to_allocate -= more_to_sample; - actual_to_sample += more_to_sample; - assert(actual_to_sample <= left_to_sample); - } + MPI_Allgather(MPI_IN_PLACE, 0, MPI_UINT32_T, + static_cast(sampled_num_on_hosts_.data()), 1, + MPI_UINT32_T, MPI_COMM_WORLD); + + for (size_t i = 1; i < sampled_num_on_hosts_.size(); i++) { + sampled_num_on_hosts_[i] += sampled_num_on_hosts_[i - 1]; + } + uint32_t share_sum = sampled_num_on_hosts_.back(); + uint32_t num_per_unit = + std::max((total_minibatch_size_ + share_sum - 1) / share_sum, size_t{1}); + + size_t my_value_to_take = 0; + size_t extra_to_distribute = 0; + size_t sanity_sum = 0; + for (size_t host = 0; host < num_hosts_; host++) { + // determine how much to pull from each host based on sampled number + uint32_t start; + uint32_t end; + if (host == 0) { + start = 0; + end = std::min(num_per_unit * sampled_num_on_hosts_[host], + (uint32_t)total_minibatch_size_); + } else if (host == (num_hosts_ - 1)) { + start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], + (uint32_t)total_minibatch_size_); + end = total_minibatch_size_; + } else { + start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], + (uint32_t)total_minibatch_size_); + end = std::min(num_per_unit * sampled_num_on_hosts_[host], + (uint32_t)total_minibatch_size_); } - leftover_to_allocate = proposed_to_sample - actual_to_sample; - current_num_on_hosts_[i] -= actual_to_sample; - sampled_num_on_hosts_[i] = 0; - if (my_host_id_ == i) { - to_return = actual_to_sample; + uint32_t proposed_to_take = end - start; + sanity_sum += proposed_to_take; + + // is there actually that much? 
check + uint32_t actual_to_take = + std::min(proposed_to_take, current_num_on_hosts_[host]); + + if (actual_to_take < proposed_to_take) { + extra_to_distribute += proposed_to_take - actual_to_take; + } + // update counts, then return + current_num_on_hosts_[host] -= actual_to_take; + if (host == my_host_id_) { + my_value_to_take = actual_to_take; } } - timer.stop(); + GALOIS_LOG_ASSERT(sanity_sum == total_minibatch_size_); + + // redistribute extra to hosts with remaining + for (size_t host = 0; host < num_hosts_; host++) { + if (!extra_to_distribute) { + // leave when there is nothing selse to distribute + break; + } - if (leftover_to_allocate) { - // if there are leftovers, it means that there is no more work - // in this system period - complete_hosts_ = num_hosts_; + size_t left_on_host = current_num_on_hosts_[host]; + if (left_on_host) { + uint32_t to_take = std::min(extra_to_distribute, left_on_host); + extra_to_distribute -= to_take; + current_num_on_hosts_[host] -= to_take; + // update my count as neccessary + if (my_host_id_ == host) { + my_value_to_take += to_take; + } + } } + timer.stop(); - return to_return; + return my_value_to_take; } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index feddc1fb2c..ea0c5dc05f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -526,9 +526,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.start(); if (!global_work_left) { - if (num_hosts_ > 1) { - GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); - } + // if (num_hosts_ > 1) { + // GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); + //} break; } } From 97e8632db69d7acd27a4b2b0ab4ac1411c026369 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Jul 2021 19:53:13 -0500 Subject: [PATCH 595/660] Test: all hosts share the same minibatch shuffle All hosts initialize a training-length array and shuffle it the same way. Removes the need for the distributed minibatch generator as well. 
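The mechanism in the diff below, sketched with hypothetical helpers: every host seeds its generator identically, builds the same global-length list of training nodes (storing a node as its local ID when present, or the local graph size as a "not here" sentinel so positions still line up), and applies the same shuffle. Each minibatch step then advances a shared cursor by the global minibatch size, and a host only sets mask bits for entries that fall within its own masters.

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <random>
    #include <vector>

    std::vector<uint32_t>
    BuildSharedOrder(const std::vector<uint8_t>& global_train_mask,
                     const std::function<bool(uint32_t)>& is_local,   // assumption: GID -> present on this host?
                     const std::function<uint32_t(uint32_t)>& to_lid, // assumption: GID -> local ID
                     uint32_t local_graph_size) {
      std::mt19937 gen(1); // fixed seed, so the permutation is identical on every host
      std::vector<uint32_t> order;
      for (uint32_t gid = 0; gid < global_train_mask.size(); ++gid)
        if (global_train_mask[gid])
          order.push_back(is_local(gid) ? to_lid(gid) : local_graph_size);
      std::shuffle(order.begin(), order.end(), gen);
      return order;
    }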
--- libgnn/include/galois/MinibatchGenerator.h | 22 ++++++++---- libgnn/include/galois/graphs/GNNGraph.h | 5 ++- libgnn/src/GraphNeuralNetwork.cpp | 41 +++++++++++----------- libgnn/src/MinibatchGenerator.cpp | 13 +++++-- libgnn/src/graphs/GNNGraph.cpp | 11 ++++++ 5 files changed, 62 insertions(+), 30 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 7e939c9cf4..8c6ae2275f 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -2,6 +2,7 @@ #include "galois/GNNTypes.h" #include "galois/Logging.h" +#include "galois/graphs/DistributedGraph.h" #include #include #include @@ -17,8 +18,9 @@ class MinibatchGenerator { : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, current_position_{0}, master_bound_{master_bound} { // set seed based on time then initialize random generate with rand() - srand(time(NULL)); + srand(1); rand_generator_ = std::make_unique(rand()); + srand(time(NULL)); GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } @@ -56,16 +58,24 @@ class MinibatchGenerator { } } - void ShuffleMode() { + void ShuffleMode(const galois::graphs::DistGraph& graph, + GNNMask& global_training_mask, size_t total_train_nodes) { if (!shuffle_mode_) { shuffle_mode_ = true; - all_indices_.reserve(master_bound_); + all_indices_.reserve(total_train_nodes); // setup all set indices for the minibatch - for (size_t pos = 0; pos < master_bound_; pos++) { - if (mask_to_minibatch_[pos]) { - all_indices_.emplace_back(pos); + for (size_t pos = 0; pos < global_training_mask.size(); pos++) { + if (global_training_mask[pos]) { + if (graph.isLocal(pos)) { + all_indices_.emplace_back(graph.getLID(pos)); + } else { + // size is greater than LID; use this as a "not present" + all_indices_.emplace_back(graph.size()); + } } } + GALOIS_LOG_ASSERT(all_indices_.size() == total_train_nodes); + // shuffle it std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); printf("Number of things in minibatch generator is %lu\n", diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 044f82e7a2..835c2cba01 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -398,7 +398,8 @@ class GNNGraph { } train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); - train_batcher_->ShuffleMode(); + train_batcher_->ShuffleMode(*partitioned_graph_, global_training_mask_, + global_training_mask_range_.size); local_minibatch_mask_.resize(partitioned_graph_->size()); return train_batcher_->ShuffleMinibatchTotal(); } @@ -777,6 +778,8 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// // TODO maybe revisit this and use an actual bitset + //! Bitset indicating which nodes are training nodes (global) + GNNMask global_training_mask_; //! Bitset indicating which nodes are training nodes GNNMask local_training_mask_; //! 
Bitset indicating which nodes are validation nodes diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ea0c5dc05f..90fa6fd009 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -124,13 +124,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; if (config_.train_minibatch_size()) { - size_t local_num = - graph_->SetupTrainBatcher(config_.train_minibatch_size()); - if (num_hosts_ > 1) { - dist_minibatch_tracker_ = std::make_unique( - galois::runtime::getSystemNetworkInterface().ID, num_hosts_, - local_num, config_.train_minibatch_size()); - } + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + // size_t local_num = + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_ = std::make_unique( + // galois::runtime::getSystemNetworkInterface().ID, num_hosts_, + // local_num, config_.train_minibatch_size()); + //} } if (config_.test_minibatch_size()) { @@ -373,9 +373,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GradientPropagation(); } else { graph_->ResetTrainMinibatcher(); - if (num_hosts_ > 1) { - dist_minibatch_tracker_->ResetEpoch(); - } + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_->ResetEpoch(); + //} SetLayerPhases(galois::GNNPhase::kBatch); @@ -395,16 +395,17 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // break when all hosts are done with minibatches prep_timer.start(); size_t seed_node_count; - if (num_hosts_ > 1) { - size_t num_for_next_batch = - dist_minibatch_tracker_->GetNumberForNextMinibatch(); - galois::gInfo(graph_->host_prefix(), "Sampling ", num_for_next_batch, - " for this minibatch"); - seed_node_count = - graph_->PrepareNextTrainMinibatch(num_for_next_batch); - } else { - seed_node_count = graph_->PrepareNextTrainMinibatch(); - } + // if (num_hosts_ > 1) { + // size_t num_for_next_batch = + // dist_minibatch_tracker_->GetNumberForNextMinibatch(); + // galois::gInfo(graph_->host_prefix(), "Sampling ", + // num_for_next_batch, + // " for this minibatch"); + // seed_node_count = + // graph_->PrepareNextTrainMinibatch(num_for_next_batch); + //} else { + //} + seed_node_count = graph_->PrepareNextTrainMinibatch(); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is for batch is ", diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index c1bb8c221d..4d851aacac 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -37,12 +37,19 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( void galois::MinibatchGenerator::ShuffleGetNextMinibatch( std::vector* batch_mask) { - size_t current_count = 0; galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); + + size_t current_count = 0; + size_t global_minibatch_size = + minibatch_size_ * galois::runtime::getSystemNetworkInterface().Num; while (current_position_ < all_indices_.size()) { - (*batch_mask)[all_indices_[current_position_++]] = 1; + size_t candidate_lid = all_indices_[current_position_++]; + if (candidate_lid < batch_mask->size() && candidate_lid < master_bound_) { + (*batch_mask)[candidate_lid] = 1; + } + current_count++; - if (current_count == minibatch_size_) + if (current_count == global_minibatch_size) break; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index e616465d1b..9f980b6134 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -490,6 +490,11 @@ size_t 
galois::graphs::GNNGraph::ReadLocalMasksFromFile( // read mask range std::string mask_filename = input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; + bool train_is_on = false; + if (mask_type == "train") { + train_is_on = true; + } + std::ifstream mask_stream; mask_stream.open(mask_filename, std::ios::in); mask_stream >> range_begin >> range_end >> std::ws; @@ -520,6 +525,9 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; local_sample_count++; } + if (train_is_on) { + global_training_mask_[cur_line_num] = 1; + } } } cur_line_num++; @@ -560,6 +568,7 @@ size_t galois::graphs::GNNGraph::FindOtherMask() { void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); local_training_mask_.resize(partitioned_graph_->size()); local_validation_mask_.resize(partitioned_graph_->size()); local_testing_mask_.resize(partitioned_graph_->size()); @@ -579,6 +588,7 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { if (partitioned_graph_->isLocal(i)) { local_training_mask_[partitioned_graph_->getLID(i)] = 1; } + global_training_mask_[i] = 1; } // validation @@ -608,6 +618,7 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { if (partitioned_graph_->isLocal(i)) { local_training_mask_[partitioned_graph_->getLID(i)] = 1; } + global_training_mask_[i] = 1; } // validation for (size_t i = global_validation_mask_range_.begin; From c42688d2d8f2eeadb0f3e78b720ed66ac675a4bb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Aug 2021 17:08:30 -0500 Subject: [PATCH 596/660] Major mirror deletion fix: don't delete masters Master proxies were being deleted from subgraphs if they did not have incoming/outgoing edges. This is a problem because the master is responsible for reducing updates from all other proxies which may still exist on other hosts, and this was the cause of accuracy degradation at a higher number of hosts. This problem did not appear much for edge cuts because master nodes would not get deleted since edges end up on master. 
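The rule the hunks below enforce can be stated as a predicate, shown here as a hypothetical sketch: a proxy survives subgraph construction if a sampled edge touches it, or if it is the master copy of any node that became active, because the master is the reduction point for that node's mirrors on other hosts during sync.

    // Hypothetical predicate only; in the code below this is expressed by setting
    // definitely_sampled_nodes_ for owned nodes the moment they are timestamped.
    bool KeepProxyInSubgraph(bool has_sampled_edge, bool is_active, bool is_master) {
      return has_sampled_edge || (is_active && is_master);
    }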
--- libgnn/include/galois/MinibatchGenerator.h | 4 +++- libgnn/include/galois/graphs/GNNGraph.h | 3 ++- libgnn/src/graphs/GNNGraph.cpp | 24 ++++++++++++++++++++++ libgnn/src/graphs/GNNSubgraph.cpp | 21 +++++++++++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 8c6ae2275f..fd7c92ff50 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -74,7 +74,9 @@ class MinibatchGenerator { } } } - GALOIS_LOG_ASSERT(all_indices_.size() == total_train_nodes); + GALOIS_LOG_VASSERT(all_indices_.size() == total_train_nodes, + "{} vs right {}", all_indices_.size(), + total_train_nodes); // shuffle it std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 835c2cba01..18604361a4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -399,7 +399,7 @@ class GNNGraph { train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); train_batcher_->ShuffleMode(*partitioned_graph_, global_training_mask_, - global_training_mask_range_.size); + global_training_count_); local_minibatch_mask_.resize(partitioned_graph_->size()); return train_batcher_->ShuffleMinibatchTotal(); } @@ -778,6 +778,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// // TODO maybe revisit this and use an actual bitset + size_t global_training_count_; //! Bitset indicating which nodes are training nodes (global) GNNMask global_training_mask_; //! Bitset indicating which nodes are training nodes diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 9f980b6134..1c7d19040b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -534,6 +534,10 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( } mask_stream.close(); + if (train_is_on) { + global_training_count_ = valid_count; + } + if (valid_count != mask_range->size) { // overlapping masks: need to actually check the masks rather than use // ranges @@ -574,6 +578,8 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { local_testing_mask_.resize(partitioned_graph_->size()); if (dataset_name == "reddit") { + global_training_count_ = 153431; + // TODO reddit is hardcode handled at the moment; better way to not do // this? 
global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; @@ -607,6 +613,8 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } } else if (dataset_name == "ogbn-papers100M-remap") { + global_training_count_ = 1207178; + global_training_mask_range_ = {.begin = 0, .end = 1207178, .size = 1207178}; global_validation_mask_range_ = { .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; @@ -1107,6 +1115,14 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } sample_node_timestamps_[*x] = timestamp; } } @@ -1203,6 +1219,14 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } sample_node_timestamps_[*x] = timestamp; } } diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 5e95b079fd..f2148b2706 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -59,6 +59,27 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( return; } + // checking sanity + // galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + // [&](uint32_t node_id) { + // if (gnn_graph.IsInSampledGraph(node_id) && + // !gnn_graph.IsActiveInSubgraph(node_id)) { + // // check if any edges are active + // for (auto a = gnn_graph.edge_begin(node_id); a != + // gnn_graph.edge_end(node_id);a++) { + // if (gnn_graph.IsEdgeSampledAny(a)) { + // galois::gWarn("ERROR node ", node_id); + // } + // } + // for (auto a = gnn_graph.in_edge_begin(node_id); a != + // gnn_graph.in_edge_end(node_id);a++) { + // if (gnn_graph.IsInEdgeSampledAny(a)) { + // galois::gWarn("ERROR in node ", node_id); + // } + // } + // } + // }); + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { // allocate a bit more than necessary to avoid a big realloc // if node value changes slightly later From c27153ab2339789b3e2c0736063c4a4586fe6f01 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Aug 2021 14:40:11 -0500 Subject: [PATCH 597/660] Revert minibatch selection to "pick locally" Instead of global minibatch view where all hosts advance selector the same way (can result in uneven minibatch size on each host), revert to scheme where all hosts select same # (which requires even distribution of training nodes across hosts). Revert this commit to get back to the other functionality. 
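The two selection schemes contrasted in this message can be sketched roughly as below. This is illustrative pseudologic with simplified types, not the `MinibatchGenerator` code itself; both ID lists are assumed to be pre-shuffled (with an identical seed on every host in the global case), and `owned_by_this_host` stands in for the real ownership check.

```
#include <cstddef>
#include <vector>

// "Pick locally": each host walks only its own training nodes and takes
// minibatch_size of them per step, so every host contributes the same count
// (this assumes training nodes are spread evenly across hosts).
std::vector<std::size_t> PickLocally(const std::vector<std::size_t>& local_train_ids,
                                     std::size_t& position,
                                     std::size_t minibatch_size) {
  std::vector<std::size_t> batch;
  while (position < local_train_ids.size() && batch.size() < minibatch_size) {
    batch.push_back(local_train_ids[position++]);
  }
  return batch;
}

// "Global view": every host advances the same window over the same global
// sequence and keeps only the IDs it owns, so per-host batch sizes can be
// uneven even though the global batch is consistent.
std::vector<std::size_t> PickFromGlobalWindow(
    const std::vector<std::size_t>& global_train_ids, std::size_t& position,
    std::size_t global_minibatch_size,
    const std::vector<bool>& owned_by_this_host) {
  std::vector<std::size_t> batch;
  std::size_t taken = 0;
  while (position < global_train_ids.size() && taken < global_minibatch_size) {
    std::size_t id = global_train_ids[position++];
    ++taken;
    if (id < owned_by_this_host.size() && owned_by_this_host[id]) {
      batch.push_back(id);
    }
  }
  return batch;
}
```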
--- libgnn/include/galois/MinibatchGenerator.h | 57 ++++++++++++++++------ libgnn/include/galois/graphs/GNNGraph.h | 14 +++--- libgnn/src/MinibatchGenerator.cpp | 20 +++++++- 3 files changed, 70 insertions(+), 21 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index fd7c92ff50..127367bdf1 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -18,9 +18,9 @@ class MinibatchGenerator { : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, current_position_{0}, master_bound_{master_bound} { // set seed based on time then initialize random generate with rand() - srand(1); - rand_generator_ = std::make_unique(rand()); + // srand(1); srand(time(NULL)); + rand_generator_ = std::make_unique(rand()); GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } @@ -32,14 +32,14 @@ class MinibatchGenerator { } } - void GetNextMinibatch(std::vector* batch_mask, size_t num_to_get) { - if (!shuffle_mode_) { - // TODO - GALOIS_LOG_FATAL("not yet implemented"); - } else { - ShuffleGetNextMinibatch(batch_mask, num_to_get); - } - } + // void GetNextMinibatch(std::vector* batch_mask, size_t num_to_get) { + // if (!shuffle_mode_) { + // // TODO + // GALOIS_LOG_FATAL("not yet implemented"); + // } else { + // ShuffleGetNextMinibatch(batch_mask, num_to_get); + // } + //} //! True if no more minibatches from this generator bool NoMoreMinibatches() { @@ -58,8 +58,34 @@ class MinibatchGenerator { } } - void ShuffleMode(const galois::graphs::DistGraph& graph, - GNNMask& global_training_mask, size_t total_train_nodes) { + //! Original shuffle mode in which every host only considers locally owned + //! training nodes in the all indices array + void ShuffleMode() { + if (!shuffle_mode_) { + shuffle_mode_ = true; + all_indices_.reserve(master_bound_); + // setup all set indices for the minibatch + for (size_t pos = 0; pos < master_bound_; pos++) { + if (mask_to_minibatch_[pos]) { + all_indices_.emplace_back(pos); + } + } + // shuffle it + std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); + printf("Number of things in minibatch generator is %lu\n", + all_indices_.size()); + } + } + + //! Distributed shuffle mode: all hosts create array with ALL global training + //! node IDs and initialize shuffler to same seed. All hosts then advance it + //! at the same time, resulting in a consistent minibatch across all hosts. + //! Will *NOT* balance # of training nodes done on a host each minibatch + //! unlike original shuffle. 
+ void + DistributedShuffleMode(const galois::graphs::DistGraph& graph, + GNNMask& global_training_mask, + size_t total_train_nodes) { if (!shuffle_mode_) { shuffle_mode_ = true; all_indices_.reserve(total_train_nodes); @@ -106,8 +132,11 @@ class MinibatchGenerator { void OriginalGetNextMinibatch(std::vector* batch_mask); void ShuffleGetNextMinibatch(std::vector* batch_mask); - void ShuffleGetNextMinibatch(std::vector* batch_mask, - size_t num_to_get); + + // Do not use these unless you know what they're doing + void DistributedShuffleGetNextMinibatch(std::vector* batch_mask); + void DistributedShuffleGetNextMinibatch(std::vector* batch_mask, + size_t num_to_get); }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 18604361a4..2eaba6e90d 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -398,8 +398,9 @@ class GNNGraph { } train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); - train_batcher_->ShuffleMode(*partitioned_graph_, global_training_mask_, - global_training_count_); + train_batcher_->ShuffleMode(); + // train_batcher_->DistributedShuffleMode(*partitioned_graph_, + // global_training_mask_, global_training_count_); local_minibatch_mask_.resize(partitioned_graph_->size()); return train_batcher_->ShuffleMinibatchTotal(); } @@ -409,10 +410,11 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes size_t PrepareNextTrainMinibatch(); - size_t PrepareNextTrainMinibatch(size_t num_to_get) { - train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); - return SetupNeighborhoodSample(GNNPhase::kBatch); - } + // Used with distributed minibatch tracker + // size_t PrepareNextTrainMinibatch(size_t num_to_get) { + // train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); + // return SetupNeighborhoodSample(GNNPhase::kBatch); + //} //! 
Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 4d851aacac..9b603fc2e4 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -37,6 +37,22 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( void galois::MinibatchGenerator::ShuffleGetNextMinibatch( std::vector* batch_mask) { + size_t current_count = 0; + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); + // loops through a number of indices locally and sets + while (current_position_ < all_indices_.size()) { + (*batch_mask)[all_indices_[current_position_++]] = 1; + current_count++; + if (current_count == minibatch_size_) + break; + } +} + +// used if all hosts have a global view of the same minibatch sequence +// (occurs if all hosts use same shuffle seed) +// Do not use unless you know what you are doing +void galois::MinibatchGenerator::DistributedShuffleGetNextMinibatch( + std::vector* batch_mask) { galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); size_t current_count = 0; @@ -54,7 +70,9 @@ void galois::MinibatchGenerator::ShuffleGetNextMinibatch( } } -void galois::MinibatchGenerator::ShuffleGetNextMinibatch( +// used with distributed minibatch tracker which is deprecated; code not +// guaranteed to work +void galois::MinibatchGenerator::DistributedShuffleGetNextMinibatch( std::vector* batch_mask, size_t num_to_get) { size_t current_count = 0; galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); From 916d25fb106a1e223d8a58bd35a4dd3b225c992d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Aug 2021 15:15:44 -0500 Subject: [PATCH 598/660] Warning/crashes added to layers Warnings or failures added to out of date layers that do not work with new GNN execution pipeline. 
--- libgnn/src/layers/DenseLayer.cpp | 4 + libgnn/src/layers/GNNLayer.cpp | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 + libgnn/src/layers/L2NormLayer.cpp | 2 + libgnn/src/layers/SAGELayer.cpp | 10 +-- libgnn/src/layers/SigmoidLayer.cpp | 4 + libgnn/src/layers/SoftmaxLayer.cpp | 76 +++++++++---------- 7 files changed, 58 insertions(+), 44 deletions(-) diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index 483ceb7850..eed3143a01 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -9,6 +9,10 @@ galois::DenseLayer::DenseLayer( : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { + // TODO Need to make sure that layer knows about forward/backward matrix + // sharing (e.g., overwriting previously used input to save space) + GALOIS_LOG_FATAL("This layer has not been kept up to date; do not use until " + "sure it's been updated"); size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 885dc1f537..82a864a41d 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -189,7 +189,7 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { - // XXX(loc) check this to make sure it works in subgraph setting + // TODO This (and dropout in general) may not work in the sampling setting size_t num_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 82522fafd9..de84903447 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -9,6 +9,10 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { + galois::gWarn( + "GCN layer not up to date with new subgraph/sampling changes; " + "do not use until updated to reflect changes (see GraphSAGE layer)"); + size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; if (!config_.disable_dropout || config_.disable_aggregate_after_update || diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index bcf66eb2f9..0d566f0b66 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -5,6 +5,8 @@ galois::L2NormLayer::ForwardPhase( #ifdef GALOIS_ENABLE_GPU // TODO #endif + GALOIS_LOG_FATAL( + "L2 layer has not been kept up to date for months; do not use"); return ForwardPhaseCPU(input_embeddings); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index f078d97bd9..25b9418fa1 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -155,7 +155,7 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { size_t num_in_temp_elements = new_output_rows * layer_dimensions_.input_columns; - //galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", + // 
galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", // in_temp_1_.size(), " and ", num_in_temp_elements, " ", // layer_dimensions_.input_columns, " ", // layer_dimensions_.output_columns); @@ -267,10 +267,10 @@ void galois::SAGELayer::WeightGradientSyncSum2() { const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { - //galois::gDebug( + // galois::gDebug( // "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", - // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", - // layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", + // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " + // ", layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", // layer_dimensions_.input_rows * layer_dimensions_.input_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -742,7 +742,7 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif - //galois::gDebug("Layer ", graph_user_layer_number_, " ", + // galois::gDebug("Layer ", graph_user_layer_number_, " ", // layer_dimensions_.output_rows, " ", // layer_dimensions_.input_columns, " ", // layer_dimensions_.output_columns); diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 1809decc8a..595fd5c023 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -7,6 +7,10 @@ const galois::PointerWithSize galois::SigmoidLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { + galois::gWarn( + "Sigmoid layer has not been kept up to date; do not use unless sure" + " it works with new changes"); + input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 70a6afa6c3..aebbb3dd9b 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -32,25 +32,25 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } } - // do softmax - GNNSoftmax(feature_length, &input_embeddings[feature_length * i], - &p_backward_output_matrix_[feature_length * i]); - // create ground truth vector for this LID - std::vector* ground_truth_vec = - ground_truth_vectors_.getLocal(); - assert(ground_truth_vec->size() == feature_length); - ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - // single class label is an index; set the correct one - (*ground_truth_vec)[static_cast( - graph_.GetSingleClassLabel(i))] = 1.0; + // do softmax + GNNSoftmax(feature_length, &input_embeddings[feature_length * i], + &p_backward_output_matrix_[feature_length * i]); + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + graph_.GetSingleClassLabel(i))] = 1.0; - // calculate loss for this LID (note not all i will be filled) - input_loss_[i] = - GNNCrossEntropy(feature_length, ground_truth_vec->data(), - &p_backward_output_matrix_[feature_length * i]); + // calculate loss for this LID (note not all i will be filled) + input_loss_[i] = + 
GNNCrossEntropy(feature_length, ground_truth_vec->data(), + &p_backward_output_matrix_[feature_length * i]); #ifndef NDEBUG - loss_accum += input_loss_[i]; - handled += 1; + loss_accum += input_loss_[i]; + handled += 1; #endif }, // TODO chunk size? @@ -91,29 +91,29 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::do_all( galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned node) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraphSubgraph(node)) - return; - } + if (IsSampledLayer()) { + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraphSubgraph(node)) + return; + } - size_t correct = graph_.GetSingleClassLabel(node); - // See here for explanation for why this works - // https://gombru.github.io/2018/05/23/cross_entropy_loss/ - // Derivation of full combined derivative isn't there, but some - // emperical inspection tells me this is likely correct - // TODO(loc) work it out myself - for (size_t idx = 0; idx < feature_length; idx++) { - if (idx == correct) { - // positive class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx] - 1; - } else { - // negative class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx]; - } + size_t correct = graph_.GetSingleClassLabel(node); + // See here for explanation for why this works + // https://gombru.github.io/2018/05/23/cross_entropy_loss/ + // Derivation of full combined derivative isn't there, but some + // emperical inspection tells me this is likely correct + // TODO(loc) work it out myself + for (size_t idx = 0; idx < feature_length; idx++) { + if (idx == correct) { + // positive class + p_backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx] - 1; + } else { + // negative class + p_backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx]; } + } }, galois::steal(), galois::loopname("SoftmaxBackward")); From adbd3b578260f2405d0c08aaeb7a7c90c71aeea2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Aug 2021 17:48:44 -0500 Subject: [PATCH 599/660] Add design doc README, some comments to SAGE Added a full markdown README for libgnn explaining some design decisions. Comments to the SAGE layer indicating which conditionals are XForm first --- libgnn/README.md | 562 ++++++++++++++++++++++++++++++++ libgnn/src/layers/SAGELayer.cpp | 3 + 2 files changed, 565 insertions(+) create mode 100644 libgnn/README.md diff --git a/libgnn/README.md b/libgnn/README.md new file mode 100644 index 0000000000..dbca774922 --- /dev/null +++ b/libgnn/README.md @@ -0,0 +1,562 @@ +Author: Loc Hoang, + +Best viewed with a Markdown viewer due to Latex + formatting. + +This file's sections are ordered such that you can read from +top to bottom and still get a decent understanding of the +pieces of `libgnn`. As such, independent portions are near the +top. + +This file is being written so that whoever works on this code in the +future has a general idea what contributions I've made to the code +and how the gnn branch differs from master. Some of these changes +need to get merged into master in the future. It also allows me +to take stock of the changes/implementation choices I've made +in the past year. 
+ +# CuSP Changes + +Variants of the regular partitions were added to allow training +nodes to be partitioned relatively evenly among machines rather +than having CVC/OEC use a regular block partition over all nodes (which +would ignore the train/val/test split). + +This causes some weird effects when this version's CuSP is used outside +of GNNs or if the training boundaries are not hardcoded (e.g., if +the training boundaries are unknown, a segfault can occur). Some care +will be needed to make this integration more clean. + +# Gluon Changes + +Many changes occurred to Gluon to optimize for the vector communication +case. A few of them are listed below. + +* Serialize/deserialize **directly** to/from the serialization and +deserialization buffers. This eliminates a large amount of redundant +copying from original source to vector to buffer (and in the reverse) +which is incredibly important for performance when communicating vectors. +Something important to also take away from this experience is that +if you have a vector of vectors, serializing each vector individually +into the buffer is a very bad idea: care should be taken to make +it so that you can serialize as much data as possible in one go. + +* QoL change: way to disable Gluon timers with a variable change/flag. + +* Method to swap out mirror handshake since this is used by subgraph +code to avoid sending messages to inactive mirrors. + +* Hochan ported large message handling from KatanaGraph into Galois. +This involved changing the serialization buffers among other things. + +# GNN Optimizers + +Only one that exists is the ADAM optimizer. Note that each +layer has its own moments and does not share them (this may or +may not be standard; I'm not sure). + +All hosts will see the same gradients due to synchronization, +so all hosts should end up making the same changes to the weights. + +# Layers + +Each layer inherits from a `GNNLayer` class which has common functionality +like weight allocation, output allocation, etc. The children classes +can add more things to it; for example, SAGE adds weights for the +concatenated feature and intermediates for intermediate calculation +(also reused in backward prop). + +One thing to note is that the backward output matrix (used to output +gradients during the backward phase) is **not** a completely independent +piece of memory: it is the **memory used by the forward output of +the layer that came before it**. The reason for this is that doing it +this way saves a very large amount of memory, especially in a full batch +setting where the number of nodes (multiplied by features/hidden feature +size) can grow very large. **Be very careful about this as it means that +you cannot reuse the output matrix from the forward pass after it +has been overwritten.** This results in some rather convoluted logic that +you may find in the code. It also means that **whenever an output matrix +is resized for any reason, the pointers that each layer holds MUST +be updated, or you will get undefined behavior**. + +## Softmax Layer + +Runs a softmax on each individual row, gets the highest value, +compares with ground truth, gets loss. + +Note that the **forward and backward output matrix are shared** in this +layer, so be careful with the assumptions made after the backward +step is run (because the forward output will no longer be accessible +after the backward step; this is why the accuracy check in the +code has to occur before backward is called). 
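For reference, the per-row computation being described is the standard softmax/cross-entropy pair; the formulas below are textbook definitions rather than anything taken from the code, with $x$ the row of input logits, $p$ the softmax output, $y$ the one-hot ground truth, and $c$ the correct class:

$$p_i = \frac{e^{x_i}}{\sum_j e^{x_j}}, \qquad L = -\sum_k y_k \log p_k = -\log p_c$$

Differentiating the composition gives $\partial L / \partial x_i = p_i - y_i$: the softmax value itself for every wrong class and the softmax value minus one for the correct class, which is the rule the backward step below relies on.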
+ +Regarding the backward step: it turns out that for single class +classification, the gradient if the answer is wrong is simply +the softmax value itself, and if the answer is right, then its +the softmax value - 1. This has the advantage of being very +numerically stable as well. + +Things are slightly more complicated for the multi-class case; some +investigation needs to be done to figure this out. + +## SAGE Layer (and GCN Layer by Extension) + +### ReLU Activation and Overwriting of the Forward Matrix + +ReLU activation is used by the compute layers: if the value +is greater than 0, it is kept, else it is discarded. + +Because the forward output matrix gets overwritten during +the backward step and because the derivative of the +ReLU operation requires knowledge of what elements were +affected by the ReLU, the system must *track* which +elements were not set to 0 using a bitmask. This +mask is used during the backward phase to keep gradients +only if their values corresponding to that gradient +were originally greater than 0, and it works even +if the original forward matrix has been overwritten. + +### Row Dimensions and Active Portions of Matrices + +An optimal version of a normal GNN should make it so that +the number of active rows decreases as execution progresses +through the layers of the GNN: the last layer's active +rows in the feature matrix should be *only* the seed +nodes (i.e., nodes that are being predicted): keeping +all nodes up to date is a waste of compute. + +The number of active nodes at the beginning of a GNNs +should be all nodes involved in the k-hop neighborhood +of the seed nodes. The next layer should remove +the kth hop from the active nodes; the layer after, +the (k-1)th hop, and so on. This can be accomplished +relatively easily without disrupting the contiguous +feature matrix by making sure that the nodes that will +be dropped are in the suffix of matrix in the order +that they will be dropped from the bottom. Then, +to drop them, the code just changes the number of input +rows for the layer so that any loops/matrix multiplies +will only look at the relevant row prefix. + +In a distributed setting, the active nodes of a particular +layer should be *shared* across all hosts; a host should not +drop a node if it is being used somewhere else *and* if +the node in question has a contribution to it (i.e., +has edges or is the master proxy). + +### SAGE's Concatenation of Input Features + +The GraphSAGE model concatenates the input feature to the aggregated +feature vector on each node after aggregation which doubles +the length of the vector. Actually doing this in the feature +matrix is not great as it would mean that the original weight +matrix needs to double in size, and additional space would have +to be allocated on top of the existing input features +with the aggregated copied over to it. + +Instead of doing this, you can allocate a separate weight matrix +of the same size as the original, multiply the original input +features with that new weight matrix, and sum it up to the final +output matrix. The result is exactly the same as if the input +feature was concatenated to the aggregated features then +multiplied with a weight matrix with double the number of rows. 
+(work it out mathematically; it's the same) + +### Intermediates and Flipping Aggregation/Linear XForm: Basics + +The GNN computation in SAGE is two-step: aggregation +followed by linear transform (more steps if dropout is enabled): +an intermediate matrix is required to store the result of the first +step for use in the next step. Additionally, keeping this +intermediate result around in memory significantly speeds up +the backward step which can use it to derive gradients. +Therefore, the SAGE layer must allocate space for the intermediate. + +The size of the intermediate changes depending on if you do +linear xform before aggregation; this is done if doing +the linear xform reduces the column dimension as it makes +the aggregation aggregate on smaller feature vector sizes (which +speeds up computation overall in general). It helps to understand +how the dimensions change after aggregation and after linear +xform. Say the input matrix is IR by IC (input row by input column). + +* Aggregation only needs to occur for the nodes that will +be active in the next layer, i.e. the *output rows* (OR). Therefore, +after aggregation, the rows of the matrix go from IR to OR. + +* Linear transform changes the number of columns to output columns (OC). +Therefore, after linear xform, IC turns to OC. + +After both operations, the output matrix to the next layer is the +expected OR by OC. Depending on which one occurs first, +the code generates an intermediate of OR by IC *or* IC by OC. +(more than one may be needed if dropout is used as that generates +a new dropout matrix). + +### Intermediates and Flipping Aggregation/Linear XForm: Backward Pass + +The computation of a SAGE layer is the following in matrix +terms where $T$ is the graph topology, $F$ is features, +and the $W$s are the two weight matrices (one for aggregated +value, other for concatenated vector). + +$TFW_1 + FW_2 = O$ + +The gradients we want are $W_{1,2}'$ and $F'$ to pass back to the next layer in +the backward phase. We have the gradient $O'$. The method in which this occurs +depends on the order of aggregation/xform in the forward phase. + +First, $FW_2$. One can derive one part of $F'$ (the other part +is from the first term) and $W_{2}'$. $F' = O'(W_2)^T$ and $W_{2}' = F^T O'$. + +Next, $TFW_1$. + +* If aggregation occurs first, we have $(TF)$ in an intermediate +matrix. The $W_{1}'$ gradient is $W_{1}' = (TF)^{T}O'$. To get one part of +$F'$, we do $O' W_{1}^{T} = (TF)'$ followed by $T^T (TF)' = F'$. +* If xform occurs first, $(FW_1)$ is in the intermediate matrix. +To get $F'$, $T^T O' = (FW_{1})'$, followed by $(FW_{1})' (W_{1})^T = F'$. +The weight gradient is $W_{1}' = F^T (FW_{1})'$. + +The $F'$ gradient from the two terms ($TFW_1$ and $FW_2$) can be summed +together. + +### Masking Out Non-Masters in Distributed Setting + +In a distributed setting, all hosts need to see the same gradient +computed in the backward phase so that the weights can all be updated +in the same manner to keep consistency across hosts. This can +be accomplished by synchronizing appropriately and making +sure that a gradient computation isn't accounted for more than +once globally. + +For $F'$, keeping it consistent simply means making sure that all +hosts compute all the required rows. This is doable if a host knows +what proxies it owns are active in the global subgraph being operated +on and makes sure that it has the most up-to-date value for that proxy's +gradient at all times. 
For example, since all hosts have a copy of the
+weights, in order to get the gradients for $F'$, all a host needs
+is to make sure $O'$ contains the gradients for local proxies
+active in a particular layer (even if they aren't part of that
+host's seed nodes). In this way, all hosts *recompute* the same gradient
+required for a proxy.
+
+For $W'$, each node contributes a gradient to it. A node is
+replicated across hosts via proxies; unlike the previous case,
+however, a *sync* of weight gradients occurs across all hosts because
+not all hosts have all proxies, and in this case, you need the
+contribution of all nodes and not just the ones you have proxies
+of, so you do **not** want a node's gradient to be computed more than
+once across all hosts. Therefore, when doing computation involving
+the weight gradient, a node's contribution should only be computed
+once **by the owner/master of that node**. As a result, non-masters
+on hosts **need to be masked when computing $W'$**.
+This presents a problem implementation-wise: masking non-masters
+is an in-place operation since you do not want to allocate
+new memory, so some care needs to be taken on which matrices to mask
+as well as when to mask them since $F'$ computation requires *non-masked*
+matrices. This is the reason for the very convoluted logic in the
+backward pass in the code that will need to be cleaned up or
+redesigned at some point.
+It might be possible to play a similar trick to active row prefixing
+where non-masters are placed lower in the rows so that "masking"
+can occur by changing the row count, but I believe I tried
+this and ran into issues with non-contiguity of masters/mirrors.
+
+Below is the masking logic used by the current code:
+
+```
+Calculate W2' using masked input or masked gradients (mask required else overcount,
+if not layer 0 then can mask input, else mask gradient)
+
+if (xform before agg)
+  Calculate (FW1)' by transpose aggregating gradients
+  Mask out the non-masters in feature matrix F if not layer 0, else mask FW1
+  Calculate W1' using F^T and (FW1)' (one of which is masked)
+  Calculate F' from W1 by using (FW1)', W1^T and W2^T (masked FW1 won't occur here,
+    because this is only required if layer isn't 0)
+else
+  Mask F if not layer 0, else mask gradient
+  Get F' from W2 by multiplying O' with W2 (no masks allowed here)
+  Mask TF^T if not layer 0 (because O' won't be masked in that case)
+  Get W1' by multiplying TF^T with O' (one will be masked)
+  Get F' from W1 by (1) multiplying O' with W1^T then (2) transpose aggregate to get F'
+    (none of the ops above should be masked)
+```
+
+The above isn't the neatest explanation of things, but essentially,
+anything involving a W' calculation requires one of the operands
+to have masked non-masters. Layer 0 is special because you
+can't mask the inputs there as those are the inputs used at
+the beginning of an epoch.
+
+### Regarding Dropout
+
+The way that dropout works is that random parts of the input
+are set to 0 for that particular batch.
+The ones set to 0 need to be memorized so that the backward
+pass can correctly compute the derivative.
+
+Dropout currently **does not work in a distributed setting**: the problem
+is that each host may drop out different weights due to the nature
+of RNG, leading to divergence on each host. One way to avoid
+this is to make it so each host drops out a particular portion only and
+synchronize this choice. This has not been implemented efficiently (yet?).
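A minimal single-host sketch of the mask-based dropout just described (illustrative only; the real CPU path lives in `GNNLayer::DoDropoutCPU` and differs in details):

```
#include <cstddef>
#include <random>
#include <vector>

// Forward: zero out random elements, remember which ones survived, and scale
// the survivors by 1/keep_probability so the expected value is unchanged.
void DropoutForward(const std::vector<float>& in, std::vector<float>* out,
                    std::vector<bool>* kept_mask, float keep_probability,
                    std::mt19937& rng) {
  std::bernoulli_distribution keep(keep_probability);
  float scale = 1.0f / keep_probability;
  out->resize(in.size());
  kept_mask->resize(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) {
    (*kept_mask)[i] = keep(rng);
    (*out)[i] = (*kept_mask)[i] ? in[i] * scale : 0.0f;
  }
}

// Backward: the derivative is the same scaled mask applied to the incoming
// gradient, which is exactly why the forward mask has to be memorized.
void DropoutDerivative(std::vector<float>* gradient,
                       const std::vector<bool>& kept_mask,
                       float keep_probability) {
  float scale = 1.0f / keep_probability;
  for (std::size_t i = 0; i < gradient->size(); ++i) {
    (*gradient)[i] = kept_mask[i] ? (*gradient)[i] * scale : 0.0f;
  }
}
```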
+**I have not kept this code up-to-date as well** as all runs I've been +doing are without dropout. + +*Therefore, it's probably better not to use it for the time being.* + +# Graph Neural Network + +`GraphNeuralNetwork.{cpp/h}` is the main class which runs the +graph neural network. It creates the layers and chains their outputs +together to create the network flow. + +## Constructor + +1) Creates the intermediate layers. See the section on Layers to get an +idea of what is done. +Typically, activation is activated for compute layers except for the last +layer: activation is typically disabled for that layer for accuracy +reasons (running activation on the final output layer messes with +predictions). + +2) If minibatching is enabled, create minibatch generators. + +3) Create the output layer (Softmax is the only one that works right now, +but Sigmoid is required for multi-class classification). + +## Training Flow + +There are a few scenarios based on if training and testing minibatching +is enabled or not. These are not necessarily the most optimal things to +do (e.g., you never want the entire graph to participate in training; +only k-hop neighborhood is required). + +1) No training/testing minibatch -> the entire graph participates in training. + +2) Training minibatch but no test minibatch -> k-hop neighborhood only, but +space required for entire graph is allocated (inefficient, should only need +k-hop neighborhood of test nodes) + +3) Train/test minibatching -> k-hop neighborhood subgraphs only, and space +for them is allocated on demand rather than worst case entire graph. + +Note that because of the way the code works, if you want to do an *efficient* +full-batch no sampling run, you should specify very large numbers for the train +and test minibatches so that the efficient code path is taken. Due to the +way the design is at the moment it will **inefficiently regenerate +the k-hop full batch train/test subgraphs when they are used**: this +need to be fixed in a future redesign where multiple subgraphs can be +swapped among. + +If a k-hop subgraph needs to be generated, it's generated with the following +flow: + +1) Choose the seed nodes (i.e., nodes that will have their output compared +to ground truth to potentially get loss/gradients to backpropagate) + +2) From seed nodes, sample a few edges OR if not sampling, choose all +of them. Activate the destination nodes, communicate this, repeat +for k hops. + +3) Correct layer dimensions based on subgraph/number of nodes at +each layer (reduce memory AND compute footprint). + +4) Generate subgraph (see subgraph construction section). + +5) Do inference and back prop, update weights, repeat. +The way this works is relatively simple: the code loops +through each layer and calls the forward or backward pass function +on it. + +Depending on how the test interval is set, between each epoch +a test subgraph may be used to check test accuracy. +The flaw with the current design is that the graph object is +only aware of one 'graph' at any one point, meaning the code +has to be very careful to generate the right graph (train/test) +for use at the right time. + +Note that the `kBatch` mode used in the Train code refers to +a status that is set on nodes based on the minibatch and only +includes *local seed nodes*, so keep this in mind when using it (there +have been unintentional problems where I assumed `kBatch` meant +more than just local seed nodes). 
The main reason for this is
+that it helps to distinguish local and global seed nodes to avoid
+over-calculating gradients.
+
+# GNN Graph
+
+`GNNGraph.{cpp/h}` is responsible for reading in the graph topology,
+labels, and features. Topology is read/partitioned via the CuSP
+infrastructure. Each host reads labels for nodes it owns; same with
+features (right now it's pretty inefficient as all hosts read the entire
+file; a better way should probably be devised).
+
+It is responsible for the synchronization substrate: Gluon is initialized
+on the partitioned graph. Normally sync occurs on the node data of the graph,
+but the node data in the GNN case is a feature vector. To get around sync
+structure limitations, a global pointer is set to point to the feature
+matrix array (along with some other globals) so that the sync structure
+can know how to access it.
+
+There are sync structures mainly for global degrees and aggregation.
+If a subgraph is used, things change slightly (see the subgraph section).
+
+The class provides functions to get degrees and also holds the minibatch
+generator. It also holds one `GNNSubgraph` object if a subgraph is being used
+(this is a limitation; there can only be one active subgraph at any one point).
+If the subgraph is active and the flag for the subgraph is on, then all
+user-facing functions on the `GNNGraph` object will access the *subgraph*
+instead of the original graph. **Be very careful with this and make sure the
+graph is in the mode that you intend.**
+
+# Subgraph Construction
+
+Subgraphs are created by the sampling/minibatch infrastructure:
+a few nodes are marked "active" along with edges, and
+the program compiles these chosen nodes/edges into a separate
+CSR for use during execution. There are a few implementation details
+during this process that will be documented here.
+
+## Code Structure
+
+The current implementation in Galois has a Subgraph class
+contained by the GNNGraph class. The subgraph is enabled
+by a flag which alters GNNGraph calls to direct to the
+subgraph instead.
+
+Optimally, we want to be able to work with many subgraphs
+at once; this design makes it difficult to do so as
+only one subgraph is contained by a GNNGraph. It would
+probably be possible to extend this design and have GNNGraph
+expose a subgraph switcher or something of the sort so that
+it isn't tied directly to the class.
+
+## Sampling
+
+The "activeness" of a node is marked on the node itself as a flag.
+In addition to this, the layer number in which a node is added
+is noted as well (the reason for this will be apparent later).
+
+Each edge has two variables associated with it: a normal flag
+saying if it has been sampled in any layer, and a bitset saying
+which layers the edge has been sampled in. This is because
+an edge once sampled is not necessarily sampled in *all* layers:
+it may be sampled in only a single layer (or many layers),
+and this info needs to be known when iterating over the edges
+to keep things correct.
+
+In addition, the degree of a node for each sampled phase
+is tracked locally. At the end of all sampling, the degrees
+of the nodes at each layer are synchronized among all hosts.
+This is required because normalization in aggregation uses
+the subgraph degrees (this is actually quite annoying
+runtime-wise as it adds this extra degree sync step).
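A rough illustration of the per-edge/per-layer bookkeeping described in this section (hypothetical layout and names, not the actual Galois bitsets; it assumes fewer than 32 layers):

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Per-edge sampling state: an edge can be sampled in some layers but not
// others, so a per-layer bit is kept alongside an "any layer" view.
struct EdgeSampleState {
  uint32_t layer_bits = 0; // bit k set => sampled in layer k
  bool any() const { return layer_bits != 0; }
  bool in_layer(unsigned k) const { return (layer_bits & (1u << k)) != 0; }
  void set_layer(unsigned k) { layer_bits |= (1u << k); }
};

// Per-node sampled degree for one layer; aggregation normalization reads
// these, which is why they must be synchronized across hosts after sampling.
std::vector<std::size_t> SampledDegreesForLayer(
    const std::vector<std::vector<EdgeSampleState>>& out_edges, unsigned layer) {
  std::vector<std::size_t> degrees(out_edges.size(), 0);
  for (std::size_t node = 0; node < out_edges.size(); ++node) {
    for (const EdgeSampleState& e : out_edges[node]) {
      if (e.in_layer(layer)) {
        ++degrees[node];
      }
    }
  }
  return degrees;
}
```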
+ +## Construction Steps + +The steps in subgraph construction are the following: + +1) Create the local ID to subgraph ID mapping (and vice versa) +2) Count degrees for the sampled vertices in order to construct +the CSR; this includes edges that may not always be active. +3) Create the CSR using the degrees. +4) Create the local subgraph features matrix by copying +them over from the original feature matrix. + +In order to make row elimination easier, +the SID of the vertices are ordered such that seed nodes are +first, the 1-hop samples next, then 2-hops, 3-hops, etc. +This makes it easy to eliminate vertices that aren't used after +a certain point by changing the row dimensions used by multiplies/ +aggregations. Master nodes that are also seed nodes always occupy +the first SIDs so that it's easy to loop through master nodes only. +Other master nodes may end up with non-contiguous SIDs as they +may become active in different layers; to track these masters +for masking later, a bitset is maintained. +Counts as to how many nodes are in each layer have to be +compiled so this process can be done in parallel. An on_each +loop is used to get SIDs in parallel. + +In addition, nodes that (1) are not master proxies and (2) do +not have any outgoing or incoming edges are eliminated from +the local subgraph. This is because some proxies do not have +edges on some hosts even if they do on other hosts, so even +if they become active, they do not change the outcome of computation +and actually add unnecessary overhead. **This dead mirror +removal is extremely important for performance.** Implementation +wise it is done by keeping a "definitely active" flag which +will only mark proxies that definitely have an edge connecting +them or proxies that are masters. + +Degree counting and graph construction proceed as normal: count +degrees, do a prefix sum, create the CSR. One thing to note is +that the CSC is also created in order to do the backward aggregation +step. The data which says which layers an edge is active in is +pointed to by the newly constructed graph. + +## Synchronization when Subgraphs Exist + +### Mirror Regeneration + +Some mirrors on a local host may be inactive in the subgraph because +they were not sampled. The subgraph code can create a new mirror +node mapping that Gluon can swap out for each subgraph. + +This has its own overhead, and from some experiments in the +past this doesn't significantly affect performance, but it's +done anyways. + +### GID to SID + +Gluon memoizes GID-LID handshakes on each host to avoid the need +to send IDs along with messages. This means that if a subgraph is being +synchronized, another conversion to SIDs must occur. There need +to be sampled graph versions of the sync structures that use +a mapping from LID to SID in order to save the updates to the correct +memory locations. + +Sometimes, due to the way Gluon works, a node that isn't part of the +active subgraph may have its data queried for extract/update. The sync +structure must account for this and check if such data is being accessed +so that it can avoid seg-faulting. + +# Minibatch Generator + +`MinibatchGenerator.{cpp/h}` takes the list of training/test nodes on +a single host and gives the user an interface for getting the nodes +in batches at a time. This is used to do minibatching of nodes across +hosts; each host picks the same number at a time before the beginning +of minibatch. 
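A compressed sketch of that interface (heavily simplified relative to `MinibatchGenerator`; the distributed variants and error handling are omitted):

```
#include <algorithm>
#include <cstddef>
#include <random>
#include <utility>
#include <vector>

// Simplified minibatch generator: shuffle the local training IDs once per
// epoch, then hand out fixed-size windows until the list is exhausted.
class SimpleMinibatchGenerator {
public:
  SimpleMinibatchGenerator(std::vector<std::size_t> train_ids,
                           std::size_t batch_size, unsigned seed)
      : ids_(std::move(train_ids)), batch_size_(batch_size), rng_(seed) {}

  void ResetEpoch() {
    position_ = 0;
    std::shuffle(ids_.begin(), ids_.end(), rng_);
  }

  bool Done() const { return position_ >= ids_.size(); }

  // Returns the next batch of up to batch_size_ training node IDs.
  std::vector<std::size_t> Next() {
    std::size_t end = std::min(position_ + batch_size_, ids_.size());
    std::vector<std::size_t> batch(ids_.begin() + position_, ids_.begin() + end);
    position_ = end;
    return batch;
  }

private:
  std::vector<std::size_t> ids_;
  std::size_t batch_size_;
  std::size_t position_ = 0;
  std::mt19937 rng_;
};
```

Each host would construct one of these over its local training IDs; under the "pick locally" scheme every host then draws the same count per step.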
+ +# Other (Dead) Files/Code + +`DistributedMinibatchTracker` was created to track variable number +of seed nodes on each host to make the sampling more like single-host +sampling. This was deprecated for a new functionality in the `MinibatchGenerator` +which does it in a much more sane manner by having all hosts see the same +global sequence of nodes to choose and moving the window locally on each +host (this can result in imbalanced seeds). + +A lot of the existing layers have not been kept up-to-date due to the rapid +development process on minibatching/sampling. Only the SAGE layer and Softmax +Layer are guaranteed to be functional as those are the ones most +of the runs have been on. + +There is an experimental implementation of something known as "sampled views" +in which an explicit subgraph isn't constructed; a mask is used instead. +Performance wise this did not do too well, so the code has been abandoned +and is not guaranteed to work. + +# Regarding GPU Code + +It has been a while since I worked on the GPU code, but the idea is essentially +to pre-allocate the same data that you would have allocated on the CPU +and use those pointers instead of CPU pointers. + +Some updates will need to be made in order to do dynamic resizing of the +data depending on the size of the minibatch. The best way to avoid this +in general, though, is to just allocate space for the test subgraph's +k-hops since that is likely to be more expensive than whatever +the minibatch size for the train nodes are (unless it's all nodes). \ No newline at end of file diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 25b9418fa1..bf301e5bdd 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -375,6 +375,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // aggregate this here before gradient starts to get overwritten + // this is xform ffirst if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { // aggregate occurs regardless of layer being equal to 0 because it is @@ -491,6 +492,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( &input_column_intermediates_, true); } } else { + // xform first + // --unmasked-- // disable concat is part of condition because otherwise this mask From 051f88b011a4fb3eb7c8eacd07e1bf032a5a6ba5 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 22 May 2023 14:42:38 -0700 Subject: [PATCH 600/660] Fix a minor bug with a file path --- libdeepgalois/include/deepgalois/layers/GluonGradients.h | 2 ++ libgnn/include/galois/graphs/GNNGraph.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index e14fe27bc8..2918cdd8dd 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -40,6 +40,8 @@ class GluonGradients { std::vector> _mirrorRanges; public: + bool is_a_graph() { return true; } + /** * Save weight gradients + number of them (i.e. size). * Then setup mirror metadata for Gluon to use during setup. diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2eaba6e90d..fff1d03ed4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -16,7 +16,7 @@ namespace galois { // TODO remove the need to hardcode this path //! 
Path to location of all gnn files static const std::string default_gnn_dataset_path = - "/net/ohm/export/iss/inputs/Learning/"; + "/home/hochan/inputs/Learning/"; //! Helper struct to maintain start/end/size of any particular range. Mostly //! used for mask ranges. From 7bd324a6ebbb46b1eb0201950b08a9be28db5adb Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Wed, 14 Jun 2023 00:14:09 -0500 Subject: [PATCH 601/660] Add timers for time breakdown --- libgnn/include/galois/GraphNeuralNetwork.h | 2 +- libgnn/src/GraphNeuralNetwork.cpp | 28 ++++++++++++++++++- libgnn/src/graphs/GNNGraph.cpp | 15 ++++++++-- libgnn/src/layers/GraphConvolutionalLayer.cpp | 11 ++++++++ libgnn/src/layers/SAGELayer.cpp | 21 ++++++++++++-- 5 files changed, 71 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index a813378116..7aa859c84c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -232,7 +232,7 @@ class GraphNeuralNetwork { private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; - bool timers_on_{false}; + bool timers_on_{true}; void EnableTimers() { timers_on_ = true; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 90fa6fd009..201da985d5 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -262,12 +262,22 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { std::vector subgraph_layer_sizes; // this subgraph only needs to be created once if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { + galois::StatTimer total_subgraph_construction_timer("TotalSubGraphConstruction", kRegionName); + galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleAllEdges", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); + total_subgraph_construction_timer.start(); + + setup_neighborhood_sample_timer.start(); // Setup the subgraph to only be the training graph size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); + subgraph_layer_sizes.emplace_back(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; + edge_sampling_timer.start(); // gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { @@ -290,8 +300,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } + edge_sampling_timer.stop(); + subgraph_construction_timer.start(); CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); CorrectBackwardLinks(); + total_subgraph_construction_timer.stop(); } galois::StatTimer epoch_timer("TrainingTime", kRegionName); @@ -327,14 +341,20 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling (no minibatches) if (config_.do_sampling() && !config_.train_minibatch_size()) { galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); + galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleEdges", kRegionName); 
mb_timer.start(); + setup_neighborhood_sample_timer.start(); size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); // gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; + edge_sampling_timer.start(); // work backwards on GCN/SAGE layers // loop backward and find last GCN/SAGE (main) layer to disable activation for (auto back_iter = gnn_layers_.rbegin(); @@ -358,8 +378,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } + edge_sampling_timer.stop(); // resize layer matrices + subgraph_construction_timer.start(); CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); CorrectBackwardLinks(); mb_timer.stop(); } @@ -386,6 +409,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); galois::StatTimer sample_time("MinibatchSampling", kRegionName); galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); mb_timer.start(); galois::Timer batch_timer; @@ -454,7 +478,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { sample_time.stop(); // resize layer matrices + subgraph_construction_timer.start(); CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); CorrectBackwardLinks(); // XXX resizes above only work for SAGE layers; will break if other @@ -659,7 +685,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { uint64_t average_epoch_time = epoch_timer.get() / num_epochs; galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", average_epoch_time); - DisableTimers(); + //DisableTimers(); // disable subgraph graph_->DisableSubgraph(); graph_->EnableSubgraphChooseAll(); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 1c7d19040b..4a83753670 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -716,11 +716,22 @@ float galois::graphs::GNNGraph::GetGlobalAccuracy( float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( PointerWithSize predictions, GNNPhase phase, bool sampling) { + galois::StatTimer global_accuracy_timer("GetGlobalAccuracy"); + galois::StatTimer global_accuracy_for_singleclass_timer("GetGlobalAccuracyForSingleClass"); + galois::StatTimer global_accuracy_for_multiclass_timer("GetGlobalAccuracyForMultiClass"); + global_accuracy_timer.start(); + float accuracy{0}; if (is_single_class_label()) { - return GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + global_accuracy_for_singleclass_timer.start(); + accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + global_accuracy_for_singleclass_timer.stop(); } else { - return GetGlobalAccuracyCPUMulti(predictions, phase, sampling); + global_accuracy_for_multiclass_timer.start(); + accuracy = GetGlobalAccuracyCPUMulti(predictions, phase, sampling); + global_accuracy_for_multiclass_timer.stop(); } + global_accuracy_timer.stop(); + return accuracy; } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index de84903447..b9a9c2120c 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ 
b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -148,6 +148,8 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_timer("BackwardPhaseWeight", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); timer.start(); assert(layer_phase_ == GNNPhase::kTrain); @@ -190,12 +192,14 @@ galois::GraphConvolutionalLayer::BackwardPhase( input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif + weight_gradient_timer.start(); // temp 2 holds aggregated feature vectors from forward phase galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); #ifdef GALOIS_ENABLE_GPU } #endif @@ -243,11 +247,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif + weight_gradient_timer.start(); galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); #ifdef GALOIS_ENABLE_GPU } #endif @@ -262,7 +268,9 @@ galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already + weight_gradient_sync_timer.start(); WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); if (!config_.disable_dropout && layer_number_ != 0) { DoDropoutDerivative(); @@ -316,6 +324,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*) { + galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); size_t num_nodes = graph_.size(); size_t last_master = *(graph_.end_owned()); assert(0 == *(graph_.begin_owned())); @@ -393,7 +402,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( galois::chunk_size<1>(), galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync + aggregate_all_sync_timer.start(); graph_.AggregateSync(aggregate_output, column_length); + aggregate_all_sync_timer.stop(); } void galois::GraphConvolutionalLayer::UpdateEmbeddings( diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index bf301e5bdd..032478745d 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -351,6 +351,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); + galois::StatTimer weight_gradient_sync_timer2("BackwardPhaseWeight2Sync", kRegionName); TimerStart(&timer); assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); @@ -431,7 +433,10 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } #endif } + + weight_gradient_sync_timer2.start(); WeightGradientSyncSum2(); + weight_gradient_sync_timer2.stop(); // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -553,7 +558,10 @@ 
galois::PointerWithSize galois::SAGELayer::BackwardPhase( p_backward_output_matrix_.data(), false); } } + + weight_gradient_sync_timer.start(); WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); // full gradient needed here; should occur after all updates if (layer_number_ != 0) { @@ -587,16 +595,19 @@ void galois::SAGELayer::AggregateAll( pts, bool is_backward) { std::string agg_timer_name = "AggregateCompute"; + std::string agg_sync_timer_name = "AggregateSync"; size_t num_rows_to_handle; if (!is_backward) { agg_timer_name += "Forward"; + agg_sync_timer_name += "Forward"; num_rows_to_handle = layer_dimensions_.output_rows; } else { agg_timer_name += "Backward"; + agg_sync_timer_name += "Backward"; num_rows_to_handle = layer_dimensions_.input_rows; } galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + galois::StatTimer aggregate_all_sync_timer(agg_sync_timer_name.c_str(), kRegionName); TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU @@ -617,8 +628,10 @@ void galois::SAGELayer::AggregateAll( TimerStop(&timer); // aggregate sync + aggregate_all_sync_timer.start(); graph_.AggregateSync(aggregate_output, column_length, is_backward, num_rows_to_handle); + aggregate_all_sync_timer.stop(); #ifdef GALOIS_ENABLE_GPU } #endif @@ -728,7 +741,8 @@ void galois::SAGELayer::AggregateAllCPU( } } }, - galois::chunk_size<1>(), galois::steal()); + galois::chunk_size<1>(), galois::steal(), + galois::loopname("SAGEAggregateAll")); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, @@ -854,10 +868,13 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number) { + galois::StatTimer total_gradient_timer("GradientDescent", kRegionName); + total_gradient_timer.start(); optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, trainable_layer_number); if (!sage_config_.disable_concat) { second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, p_layer_weights_2_, 0); } + total_gradient_timer.stop(); } From 5ab5c10238c49e5e1d0b15a14e6a5a998ab784cf Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Tue, 25 Jul 2023 03:35:18 -0500 Subject: [PATCH 602/660] WMD CSV-based graph ingestion in Galois (#3) This commit enables Galois/Gluon to read WMD CSV-based graphs. This is temporary code that reads the whole graph into memory on each machine, which does not scale. It will be updated to scalable graph ingestion with new dynamic graph data types (e.g., Log-structured CSR). For now, users can test the WFs and ISBs with the WMD inputs through this.
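A minimal usage sketch, adapted from the libcusp/test/shad-dist-graph.cpp test added in this patch (the CSV path and the partitioning-policy template argument below are placeholders; WMD node and edge types are both stored as uint64_t):

    #include "galois/graphs/CuSPPartitioner.h"

    // Partition a WMD CSV graph: the new useShad flag switches CuSP to the
    // SHAD ingestion path, and the WMD graph is treated as symmetric.
    auto graph = galois::cuspPartitionGraph<SomePartitionPolicy, uint64_t, uint64_t>(
        "path/to/wmd-data.csv", galois::CUSP_CSR, galois::CUSP_CSR,
        /*useShad*/ true, /*symmetricGraph*/ true);

The test additionally constructs a shad::ShadGraphConverter and calls readSHADFile() on the same file to cross-check the global node and edge counts against the partitioned graph.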
--- CMakeLists.txt | 7 +- README_SHAD.md | 57 ++ libcusp/CMakeLists.txt | 2 + .../include/galois/graphs/CuSPPartitioner.h | 8 +- .../include/galois/graphs/DistributedGraph.h | 258 ++++++ libcusp/include/galois/graphs/NewGeneric.h | 180 ++++- libcusp/test/CMakeLists.txt | 2 + libcusp/test/shad-dist-graph.cpp | 118 +++ libdeepgalois/include/deepgalois/types.h | 1 + .../include/galois/graphs/BufferedGraph.h | 43 +- libgalois/include/shad/DataTypes.h | 734 ++++++++++++++++++ libgalois/include/shad/Graph.h | 169 ++++ libgalois/include/shad/GraphTypes.h | 71 ++ libgalois/include/shad/ShadGraphConverter.h | 712 +++++++++++++++++ libgnn/CMakeLists.txt | 7 +- libgnn/include/galois/graphs/GNNGraph.h | 5 +- libgnn/src/graphs/GNNGraph.cpp | 19 +- lonestar/analytics/distributed/CMakeLists.txt | 1 - lonestar/gnn/include/DistributedGraphLoader.h | 13 +- lonestar/gnn/src/DistributedGraphLoader.cpp | 5 + .../libdistbench/include/DistBench/Input.h | 67 +- lonestar/libdistbench/src/Input.cpp | 5 + lonestar/libgnnbench/src/Input.cpp | 8 +- .../scientific/cpu/longestedge/test/catch.hpp | 7 + 24 files changed, 2418 insertions(+), 81 deletions(-) create mode 100644 README_SHAD.md create mode 100644 libcusp/test/CMakeLists.txt create mode 100644 libcusp/test/shad-dist-graph.cpp create mode 100644 libgalois/include/shad/DataTypes.h create mode 100644 libgalois/include/shad/Graph.h create mode 100644 libgalois/include/shad/GraphTypes.h create mode 100644 libgalois/include/shad/ShadGraphConverter.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1eaa1e1e0a..88eaa64d74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,7 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") # TODO; this is GNN related; find better way to do than hardcode -SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) +#SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. @@ -141,8 +141,7 @@ endif() # TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file ################################################################################ if(USE_MKL_BLAS) - SET(INTEL_ROOT /opt/apps/sysnet/intel/20.0) - SET(MKL_ROOT ${INTEL_ROOT}/mkl) + SET(MKL_ROOT /home/hochan/intel/oneapi/mkl/2023.1.0) find_package(MKL REQUIRED) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") if (MKL_FOUND) @@ -151,7 +150,7 @@ if(USE_MKL_BLAS) endif() endif() -SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) +#SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) if(USE_OPENBLAS) find_package(OpenBLAS) message(STATUS "OpenBLAS: ${OPENBLAS_INCLUDE_DIRS}") diff --git a/README_SHAD.md b/README_SHAD.md new file mode 100644 index 0000000000..4253bb0e55 --- /dev/null +++ b/README_SHAD.md @@ -0,0 +1,57 @@ +README related to SHAD input graph ingestion +(Including some notes for other workflows) +This README is for internal use. +This README will be refined with more concrete information later. + +1. CMakeLists paths: +The current CMake in Galois uses hard-coded paths for CUDA_HOME, +OPENBLAS_ROOT, INTEL_COMPILER_LIBRARIES, and MKL_LIBRARIES. +Please set those variables based on your environment. + + +2. Assumptions regarding SHAD WMD graph formats: +We assume that in SHAD WMD graph formats, each node and edge has a single type, +and those types are ALWAYS uint64_t.
+The current Galois does not support node/edge properties (programmers can +implement a struct containing multiple fields, but that is not per-property +access like getData(n), etc.), and so we store those SHAD types in the node +and edge data. +If you need types other than uint64_t, you should add new execution paths for +them. + + +3. Limitations of the current SHAD graph ingestion module: +In the original CuSP, each host reads parts of the .gr graph file and constructs +the in-memory format. In this case, each host does not need to load the full graph +in its memory space. This is possible because the .gr file is CSR and each component, +such as the outgoing edge indices, outgoing edge destinations, and outgoing edge +data, is stored consecutively. + +However, in the SHAD graph format, the components are not stored consecutively. +They are unsorted; for example, edges and nodes can be stored in an interleaved +manner. Therefore, it is not possible to read partial graphs with +the original method. + +The current SHAD graph ingestion does not aim for a scalable method, but rather +to make SHAD graphs work in Galois so that the workflows can proceed; +each host therefore reads the FULL graph into memory. This should NOT be the final +artifact, since our long-run target graphs should exceed a single machine's memory. +But for the immediate goal and the target data sets, I assume that it is fine +for now. + +The UT team is currently working on new graph formats for dynamic graphs, and on +scalable SHAD graph ingestion across hosts. + +4. TODO: +CuSP marks training/test/validation nodes while it is partitioning a graph. +This is not implemented yet for a SHAD graph. +It will be added in a GNN/feature construction branch. + +5. Requirements: +Galois-GNN requires the additional packages listed below on top of the requirements of Galois.
+You can use older/newer versions but let me (hochan) also list the versions that I have used: +1) Intel MKL: 2023.1.0 +2) Intel Compiler (including runtime libraries): 2023.0.0 +3) Intel Onedpl-devel library: 2023.1.0 +4) Intel OpenMP: 2023.0.0 + diff --git a/libcusp/CMakeLists.txt b/libcusp/CMakeLists.txt index 2cc6e1714d..67b603019e 100644 --- a/libcusp/CMakeLists.txt +++ b/libcusp/CMakeLists.txt @@ -27,3 +27,5 @@ install(TARGETS galois_cusp COMPONENT lib INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) + +add_subdirectory(test) diff --git a/libcusp/include/galois/graphs/CuSPPartitioner.h b/libcusp/include/galois/graphs/CuSPPartitioner.h index 6df9707a27..6b7fef6dab 100644 --- a/libcusp/include/galois/graphs/CuSPPartitioner.h +++ b/libcusp/include/galois/graphs/CuSPPartitioner.h @@ -50,6 +50,7 @@ using DistGraphPtr = * to the partitioner * @param outputType Specifies the output format (CSR or CSC) that each * partition will be created in + * @param useShad "true" if the passed graph file format is a SHAD WMD graph * @param symmetricGraph This should be "true" if the passed in graphFile * is a symmetric graph * @param transposeGraphFile Transpose graph of graphFile in Galois binary @@ -83,7 +84,8 @@ template DistGraphPtr cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, - CUSP_GRAPH_TYPE outputType, bool symmetricGraph = false, + CUSP_GRAPH_TYPE outputType, bool useShad = false, + bool symmetricGraph = false, std::string transposeGraphFile = "", std::string masterBlockFile = "", bool cuspAsync = true, uint32_t cuspStateRounds = 100, @@ -126,13 +128,13 @@ cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, } return std::make_unique( - inputToUse, net.ID, net.Num, cuspAsync, cuspStateRounds, useTranspose, + inputToUse, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, useTranspose, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } else { // symmetric graph path: assume the passed in graphFile is a symmetric // graph; output is also symmetric return std::make_unique( - graphFile, net.ID, net.Num, cuspAsync, cuspStateRounds, false, + graphFile, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, false, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 0e3e5fa43c..415afba33d 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -347,6 +347,203 @@ class DistGraph { increment_evilPhase(); } + /** + * Given the number of global nodes, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host. Considers + * ONLY nodes and not edges. + * + * @param numGlobalNodes The number of global nodes to divide + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + void computeMastersBlockedNodes(uint64_t numGlobalNodes, + const std::vector& scalefactor, + unsigned DecomposeFactor = 1) { + uint64_t numNodes_to_divide = numGlobalNodes; + if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) { + for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i) + gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide, + i, numHosts * DecomposeFactor)); + return; + } + + // TODO: not compatible with DecomposeFactor. + assert(scalefactor.size() == numHosts); + + unsigned numBlocks = 0; + + for (unsigned i = 0; i < numHosts; ++i) { + numBlocks += scalefactor[i]; + } + + std::vector> blocks; + for (unsigned i = 0; i < numBlocks; ++i) { + blocks.push_back( + galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks)); + } + + std::vector prefixSums; + prefixSums.push_back(0); + + for (unsigned i = 1; i < numHosts; ++i) { + prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]); + } + + for (unsigned i = 0; i < numHosts; ++i) { + unsigned firstBlock = prefixSums[i]; + unsigned lastBlock = prefixSums[i] + scalefactor[i] - 1; + gid2host.push_back( + std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second)); + } + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the only edges of the node to get + * even blocks. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + void computeMastersBalancedEdges(uint64_t numGlobalNodes, + uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t edgeWeight, + unsigned DecomposeFactor = 1) { + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + + gid2host.resize(numHosts * DecomposeFactor); + for (unsigned d = 0; d < DecomposeFactor; ++d) { + // TODO(hc): + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, 0, edgeWeight, (id + d * numHosts), + numHosts * DecomposeFactor, outIndices, scalefactor); + gid2host[id + d * numHosts].first = *(r.first.first); + gid2host[id + d * numHosts].second = *(r.first.second); + } + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) { + continue; + } + galois::runtime::SendBuffer b; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); + } + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]); + } + ++received; + } + increment_evilPhase(); + +#ifndef NDEBUG + // TODO(hc): + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + assert(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == numGlobalNodes); + } else { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == gid2host[h + 1].first); + } + } +#endif + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by evenly + * (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the edges of the node AND the node itself. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. Ignored + * in this function currently. 
+ * + * @todo make this function work with decompose factor + */ + void computeMastersBalancedNodesAndEdges( + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint64_t* outIndices, const std::vector& scalefactor, + uint32_t nodeWeight, uint32_t edgeWeight, unsigned) { + if (nodeWeight == 0) { + nodeWeight = numGlobalEdges / numGlobalNodes; // average degree + } + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + gid2host.resize(numHosts); + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, + id, numHosts, outIndices, scalefactor); + gid2host[id].first = *r.first.first; + gid2host[id].second = *r.first.second; + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + } + + protected: /** * Wrapper call that will call into more specific compute masters @@ -401,6 +598,67 @@ class DistGraph { return numNodes_to_divide; } + /** + * Wrapper call that will call into more specific compute masters + * functions that compute masters based on nodes, edges, or both. + * + * @param masters_distribution method of masters distribution to use + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param nodeWeight weight to give nodes when computing balance + * @param edgeWeight weight to give edges when computing balance + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution, + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, + unsigned DecomposeFactor = 1) { + galois::Timer timer; + timer.start(); + uint64_t numNodes_to_divide = numGlobalNodes; + + // compute masters for all nodes + switch (masters_distribution) { + case BALANCED_MASTERS: + computeMastersBlockedNodes( + numGlobalNodes, scalefactor, DecomposeFactor); + break; + case BALANCED_MASTERS_AND_EDGES: + computeMastersBalancedNodesAndEdges( + numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, nodeWeight, edgeWeight, DecomposeFactor); + break; + case BALANCED_EDGES_OF_MASTERS: + default: + computeMastersBalancedEdges( + numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, edgeWeight, DecomposeFactor); + break; + } + + timer.stop(); + + galois::runtime::reportStatCond_Tmax( + GRNAME, "MasterDistTime", timer.get()); + +#if 0 + galois::gDebug( + "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, + " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); +#endif + return numNodes_to_divide; + } + + //! reader assignment from a file //! corresponds to master assignment if using an edge cut void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) { diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 4ff7832f3e..49c96a965c 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -29,6 +29,9 @@ #include "galois/graphs/DistributedGraph.h" #include "galois/DReducible.h" + +#include "shad/ShadGraphConverter.h" + #include #include @@ -220,7 +223,8 @@ class NewDistGraphGeneric : public DistGraph { */ NewDistGraphGeneric( const std::string& filename, unsigned host, unsigned _numHosts, - bool cuspAsync = true, uint32_t stateRounds = 100, bool transpose = false, + bool useShad = false, bool cuspAsync = true, uint32_t stateRounds = 100, + bool transpose = false, galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, std::string masterBlockFile = "", bool readFromFile = false, @@ -240,17 +244,65 @@ class NewDistGraphGeneric : public DistGraph { return; } - galois::graphs::OfflineGraph g(filename); + galois::graphs::OfflineGraph* offlineGraph{nullptr}; + + shad::ShadGraphConverter shadConverter; + galois::graphs::BufferedGraph bufGraph; + bufGraph.resetReadCounters(); - base_DistGraph::numGlobalNodes = g.size(); - base_DistGraph::numGlobalEdges = g.sizeEdges(); std::vector dummy; // not actually getting masters, but getting assigned readers for nodes if (masterBlockFile == "") { - base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight); + if (useShad) { + std::cout << "Construct a distributed graph from SHAD WMD format.\n"; + uint64_t numGlobalNodes{0}, numGlobalEdges{0}; + // Read and load the whole SHAD WMD dataset to memory. + // TODO(hc): Note that this reads the entire graph. + // We will improve this to read partial graphs + // on each host later. For now, the main focus is + // to enable WMD dataset for the workflows. 
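+        // In outline, the SHAD path below proceeds as follows:
+        //   1. readSHADFile() parses the whole CSV and returns the global
+        //      node and edge counts;
+        //   2. constructNodeArrays() builds the global out-index and node
+        //      data arrays;
+        //   3. computeMasters() assigns a contiguous node range to each
+        //      host; the edge arrays and the BufferedGraph for the local
+        //      partition are constructed further below once that range is
+        //      known.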
+ shadConverter.readSHADFile(filename, &numGlobalNodes, &numGlobalEdges); + base_DistGraph::numGlobalNodes = numGlobalNodes; + base_DistGraph::numGlobalEdges = numGlobalEdges; + // Construct node data/outgoing index range arrays + // for a GLOBAL array, not a local array. + // Later, parts for the local graph partition will be + // extracted and be used after graph partitioning is done. + // Basically, the idea that is used here is to mimic + // the BufferedGraph. BufferedGraph does not load the whole arrays + // to memory, but only reads and loads parts of the arrays from + // an input file. It is possible since the .gr files are stored + // in a CSR format, and in a consecutive manner. We can know + // offset for each data in advance. + // However, we cannot achieve it from a SHAD graph file since + // it is not consecutive, but edges and nodes are mixed. + // Due to this, we construct nodes' array for a global graph + // here. This array will be restructured after CuSP decides + // local nodes. + // TODO(hc): UT will improve and redesign this part to + // get scalability. + shadConverter.constructNodeArrays( + 0, numGlobalNodes, numGlobalNodes); + + // Compute master proxies by using the number of global nodes + // and edges. + base_DistGraph::computeMasters( + md, base_DistGraph::numGlobalNodes, + base_DistGraph::numGlobalEdges, + shadConverter.getOutIndexBuffer(), dummy, nodeWeight, + edgeWeight); + } else { + offlineGraph = new galois::graphs::OfflineGraph(filename); + base_DistGraph::numGlobalNodes = offlineGraph->size(); + base_DistGraph::numGlobalEdges = offlineGraph->sizeEdges(); + base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, edgeWeight); + } } else { + if (useShad) { + GALOIS_DIE("SHAD graph format does not support master block file"); + } galois::gInfo("Getting reader assignment from file"); - base_DistGraph::readersFromFile(g, masterBlockFile); + base_DistGraph::readersFromFile(*offlineGraph, masterBlockFile); } graphPartitioner = std::make_unique( @@ -261,17 +313,18 @@ class NewDistGraphGeneric : public DistGraph { // get training nodes and split evenly among hosts std::vector trainPoints = this->getGNNBreakpoints(filename); + // TODO(hc) if (!trainPoints.empty()) { std::vector testDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, g, trainPoints[0], trainPoints[1]); + base_DistGraph::numHosts, *offlineGraph, trainPoints[0], trainPoints[1]); std::vector restDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, g, trainPoints[1], g.size()); + base_DistGraph::numHosts, *offlineGraph, trainPoints[1], offlineGraph->size()); // create global distribution of edges - std::vector mappings(g.size()); + std::vector mappings(offlineGraph->size()); galois::do_all( galois::iterate((size_t)0, (size_t)base_DistGraph::numHosts), [&](size_t h) { @@ -294,13 +347,6 @@ class NewDistGraphGeneric : public DistGraph { } } - uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; - typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = - g.edge_begin(nodeBegin); - uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; - typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = - g.edge_begin(nodeEnd); - // signifies how many outgoing edges a particular host should expect from // this host std::vector> numOutgoingEdges; @@ -321,13 +367,59 @@ class NewDistGraphGeneric : public DistGraph { // phase 0 galois::gDebug("[", base_DistGraph::id, "] Starting 
graph reading."); - galois::graphs::BufferedGraph bufGraph; - bufGraph.resetReadCounters(); galois::StatTimer graphReadTimer("GraphReading", GRNAME); graphReadTimer.start(); - bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, - *edgeEnd, base_DistGraph::numGlobalNodes, - base_DistGraph::numGlobalEdges); + + uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; + uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; + + if (!useShad) { + // If the input graph is not SHAD WMD format, + // construct a buffered graph from the file directly, as ordinary. + typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = + offlineGraph->edge_begin(nodeBegin); + typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = + offlineGraph->edge_begin(nodeEnd); + bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, + *edgeEnd, base_DistGraph::numGlobalNodes, + base_DistGraph::numGlobalEdges); + } else { + // Now construct arrays for in-memory CSR. + // In case of the node out-going edge range array and + // the node data array, it will extract parts corresponding to + // local graph paritition from the arrays holding the global + // array information. + // Edge destination and data arrays are constructed based on + // unrefined maps constructed from SHAD graph reading. + // NOTE that those arrays all store GLOBAL node ids. + // For example, edge destination array's size is equal + // to the number of local edges, but its destination ID is + // global node IDs, not local node IDs. + uint32_t numLocalNodes = nodeEnd - nodeBegin; + // So, this holds outgoing edge array of a whole (global) graph. + uint64_t *outIndexBuffer = shadConverter.getOutIndexBuffer(); + // Global edge id range assigned to the current host. + uint64_t edgeBegin = + (nodeBegin == 0)? 0 : outIndexBuffer[nodeBegin - 1]; + // This is the last local node's edge range end. + // So, [edgeBegin, edgeEnd) is for this current host. + uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; + // Extract node out-going range and data arrays of local nodes. + // From now on, those arrays store local node information + // as a dense memory representation. + shadConverter.extractLocalOutIndexArray( + nodeBegin, nodeEnd); + + uint64_t numLocalEdges = edgeEnd - edgeBegin; + shadConverter.constructEdgeArrays( + nodeBegin, edgeBegin, numLocalNodes, numLocalEdges); + // Construct a buffered graph that is used by CuSP to partition + // a graph. + shadConverter.constructBufferedGraph( + base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + nodeBegin, nodeEnd, edgeBegin, edgeEnd, &bufGraph); + } + graphReadTimer.stop(); galois::gDebug("[", base_DistGraph::id, "] Reading graph complete."); @@ -455,6 +547,15 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.stop(); galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); + if (useShad) { + // Different from the gr format file that has been used by Galois + // and does not contain node data in the file, + // a SHAD graph file has a single type for each node, and it + // is considered as node data. + // This function constructs and sets node data (type). + assignNodeDataFromSHADProp(&shadConverter); + } + // report state rounds if (base_DistGraph::id == 0) { galois::runtime::reportStat_Single(GRNAME, "CuSPStateRounds", @@ -503,6 +604,43 @@ class NewDistGraphGeneric : public DistGraph { return toReturn; } + /** + * @brief Assign a SHAD node type to a node data. 
+ * + * @detail Different from the gr format file that has been used by Galois + * and does not contain node data in the file, + * a SHAD graph file has a single type for each node, and it + * considered as node data. This function constructs and sets node + * data based on that. + * This function assumes that the node type's data type is always + * uint64_t. + * + * @tparam T Node data type + * + * @param shadConverter SHAD graph converter holding node data from a + * SHAD file. + */ + template >* = nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { + galois::gPrint("[", base_DistGraph::id, "] Graph node data is assigned."); + uint64_t* nodeDataBuffer = shadConverter->getNodeDataBuffer(); + galois::do_all(galois::iterate(base_DistGraph::allNodesRange()), + [&](uint32_t lid) { + uint64_t gid = this->getGID(lid); + this->getData(lid) = nodeDataBuffer[gid]; + std::cout << "lid :" << lid << " is set to " << + this->getData(lid) << "\n"; + }); + } + + template >* = nullptr> + void assignNodeDataFromSHADProp( + [[maybe_unused]] shad::ShadGraphConverter* shadConverter) {} + /** * For each other host, determine which nodes that this host needs to get * info from diff --git a/libcusp/test/CMakeLists.txt b/libcusp/test/CMakeLists.txt new file mode 100644 index 0000000000..710627302c --- /dev/null +++ b/libcusp/test/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(shad_dist_graph shad-dist-graph.cpp) +target_link_libraries(shad_dist_graph galois_gnn) diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp new file mode 100644 index 0000000000..fe71231295 --- /dev/null +++ b/libcusp/test/shad-dist-graph.cpp @@ -0,0 +1,118 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. 
+ */ + +#include + +#include "galois/Galois.h" +#include "galois/graphs/CuSPPartitioner.h" +#include "shad/ShadGraphConverter.h" + +int main() { + galois::DistMemSys G; + unsigned M = galois::substrate::getThreadPool().getMaxThreads(); + //M = 1; + galois::setActiveThreads(M); + + shad::ShadGraphConverter shadConverter; + size_t numNodes{0}, numEdges{0}; + + std::string filename = "/home/hochan/data.csv"; + shadConverter.readSHADFile(filename, &numNodes, &numEdges); + std::unique_ptr> + graph = galois::cuspPartitionGraph( + filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); + + galois::DGAccumulator sumGlobalNodes; + galois::DGAccumulator sumGlobalEdges; + + sumGlobalNodes.reset(); + sumGlobalEdges.reset(); + + sumGlobalNodes += graph->numMasters(); + sumGlobalEdges += graph->sizeEdges(); + + uint64_t reducedSumGlobalNodes = sumGlobalNodes.reduce(); + uint64_t reducedSumGlobalEdges = sumGlobalEdges.reduce(); + + assert(reducedSumGlobalNodes == numNodes); + assert(reducedSumGlobalNodes == graph->globalSize()); + assert(reducedSumGlobalEdges == numEdges); + assert(reducedSumGlobalEdges == graph->globalSizeEdges()); + + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; + uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; + { + std::ofstream fp(std::to_string(id) + ".master"); + for (uint32_t src = 0; src < graph->numMasters(); ++src) { + uint64_t srcglobal = graph->getGID(src); + fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + for (auto e : graph->edges(src)) { + uint32_t dstlocal = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dstlocal); + fp << "\t edge dst " << dstglobal << ", type: " << + graph->getEdgeData(e) << "\n"; + } + } + fp.close(); + } + + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { continue; } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".graph"); + for (uint32_t i = 0; i < graph->size(); ++i) { + fp << i << ", " << graph->getGID(i) << ", " << + graph->getData(i) << "\n"; + } + fp.close(); + } + } + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".mirror"); + for (uint32_t i = 0; + i < graph->getMirrorNodes()[host].size(); ++i) { + uint64_t srcglobal = graph->getMirrorNodes()[host][i]; + uint32_t src = graph->getLID(srcglobal); + fp << "src:" << src << ", global:" << srcglobal << ", node data:" << + graph->getData(src) << "\n" << std::flush; + + assert(shadConverter.checkNode(srcglobal, graph->getData(src))); + fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + //if (std::distance(graph->edge_begin(src), graph->edge_end(src)) > 0) { + for (auto e : graph->edges(src)) { + uint32_t dst = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dst); + assert(shadConverter.checkNode(dstglobal, graph->getData(dst))); + assert(shadConverter.checkEdge(srcglobal, dstglobal, + std::distance(graph->edge_begin(src), e), + graph->getEdgeData(e))); + fp << "\t edge dst " << dstglobal << ", type: " << + graph->getEdgeData(e) << "\n" << std::flush; + } + } + fp.close(); + } + } + + return 0; +} diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 43d55eb331..17dd05b15d 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -3,6 +3,7 @@ #include #include #include +#include // TODO namespace diff --git 
a/libgalois/include/galois/graphs/BufferedGraph.h b/libgalois/include/galois/graphs/BufferedGraph.h index 22cc10cc11..956c9d7d7a 100644 --- a/libgalois/include/galois/graphs/BufferedGraph.h +++ b/libgalois/include/galois/graphs/BufferedGraph.h @@ -277,6 +277,46 @@ class BufferedGraph { */ BufferedGraph() { resetReadCounters(); } + /** + * @brief Construct a buffered graph from parameters paseed. + * The array parameters should be constructed outside. + * + * @param _outIndexBuffer Outgoing neighbors range for each node + * @param _edgeDestBuffer Outgoing edge destination nodes + * @param _edgeDataBuffer Outgoing edge data + * @param _globalsize The number of global nodes + * @param _globalEdgeSize The number of global edges + * @param _numLocalNodes The number of local nodes + * @param _numLocalEdges The number of local edges + * @param _nodeOffset Node offsets on the global node space of + * the current host + * @param _edgeOffset Edge offsets on the global edge space of + * the current host + */ + void constructFrom(uint64_t* _outIndexBuffer, uint32_t* _edgeDestBuffer, + EdgeDataType* _edgeDataBuffer, uint32_t _globalSize, + uint64_t _globalEdgeSize, uint32_t _numLocalNodes, + uint64_t _numLocalEdges, uint64_t _nodeOffset, + uint64_t _edgeOffset) { + assert(_outIndexBuffer != nullptr); + assert(_edgeDestBuffer != nullptr); + assert(_edgeDataBuffer != nullptr); + outIndexBuffer = _outIndexBuffer; + edgeDestBuffer = _edgeDestBuffer; + edgeDataBuffer = _edgeDataBuffer; + globalSize = _globalSize; + globalEdgeSize = _globalEdgeSize; + numLocalNodes = _numLocalNodes; + numLocalEdges = _numLocalEdges; + nodeOffset = _nodeOffset; + edgeOffset = _edgeOffset; + resetReadCounters(); + graphLoaded = true; + numBytesReadOutIndex += sizeof(uint64_t); + numBytesReadEdgeDest += sizeof(uint64_t); + numBytesReadEdgeData += sizeof(uint64_t); + } + /** * On destruction, free allocated buffers (if necessary). */ @@ -430,10 +470,9 @@ class BufferedGraph { } assert(nodeOffset <= globalNodeID); assert(globalNodeID < (nodeOffset + numLocalNodes)); - numBytesReadOutIndex += sizeof(uint64_t); - uint64_t localNodeID = globalNodeID - nodeOffset; + return EdgeIterator(outIndexBuffer[localNodeID]); } diff --git a/libgalois/include/shad/DataTypes.h b/libgalois/include/shad/DataTypes.h new file mode 100644 index 0000000000..84dc770bee --- /dev/null +++ b/libgalois/include/shad/DataTypes.h @@ -0,0 +1,734 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// SHAD +// +// The Scalable High-performance Algorithms and Data Structure Library +// +//===----------------------------------------------------------------------===// +// +// Copyright 2018 Battelle Memorial Institute +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LIBGALOIS_INCLUDE_SHAD_DATATYPES_H_ +#define LIBGALOIS_INCLUDE_SHAD_DATATYPES_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace shad { + +/// @brief Data conversion utilities. +/// +/// Please refer to methods specialization to check +/// which data types are supported. +namespace data_types { + + /// @brief Enumeration of supported data types. + /// + /// The enumeration is meant to be used when parsing data + /// (i.e. type information is not known at compile time). + enum data_t { + STRING = 0, // string support is currenlty limited + CHARS, // sequence of characters + UINT, // unsigned, binds by default to uint64_t + INT, // int, binds by default to int64_t + FLOAT, // float, binds by default to float + DOUBLE, // double, binds by default to double + BOOL, // bool, binds by default to bool + DATE, // date in "%y-%m-%d" format, binds by default to time_t + USDATE, // date in "%m/%d/%y" format, binds by default to time_t + DATE_TIME, // date in "%y-%m-%dT%H:%M:%S" format, + // binds by default to time_t + IP_ADDRESS, // IPv4, binds by default to data_types::ipv4_t + LIST_UINT, // Sequence of unsigneds, support currently limited + LIST_INT, // Sequence of integers, support currently limited + LIST_DOUBLE, // Sequence of doubles, support currently limited + NONE + }; + + /// @brief Data structures for storing schema information. + /// Given a tuple of data, it associates elements labels and data types + /// to their position in the tuple. + using schema_t = std::vector>; + + /// @brief Encoded null value. + /// @tparam ENC_t encoding type. + /// @return Encoded null value for ENC_t. + template + constexpr ENC_t kNullValue = ENC_t(); + + /// @brief Encoded null value for uint64_t. + /// @return Null encoded value for uint64_t. + template <> + constexpr uint64_t kNullValue = std::numeric_limits::max(); + + /// @brief Encoded null value for time_t (same as long). + /// @return Null encoded value for time_t (same as long). + template <> + constexpr time_t kNullValue = std::numeric_limits::max(); + + /// @brief Encoded null value for double. + /// @return Null encoded value for double. + template <> + constexpr double kNullValue = std::numeric_limits::max(); + + /// @brief Encode Function + /// Available specializations: + /// ENC_t = uint64_t, IN_t = std::string + /// @tparam ENC_t The type to encode to. + /// @tparam IN_t The type (format) of the data to encode. + /// @tparam DT data_types::data_t of the data to encode. + /// @param in Data to encode. + /// @return Encoded data. + template + ENC_t encode(IN_t &in); + + /// @brief Encode Function + /// Available specializations: + /// ENC_t = uint64_t, IN_t = default bindings of data_types::data_t + /// @tparam ENC_t The type to encode to. + /// @tparam IN_t The type of the data to encode. + /// @param in Data to encode. + /// @return Encoded data. 
+ template + ENC_t encode(IN_t &in); + + template + ENC_t encode(IN_t &in, data_t dt); + + template + std::array encode(std::string &str) { + std::array res; + if (str.size() > 0) { + memcpy(res.data(), str.data(), sizeof(ENC_t)*MAX_s); + } else { + res.fill('\0'); + } + return res; + } + + template + typename std::enable_if<(std::is_arithmetic::value or (sizeof(DEC_t) == sizeof(ENC_t))), DEC_t>::type + decode(ENC_t encvalue) { + DEC_t val; + memcpy(&val, &encvalue, sizeof(DEC_t)); + return val; + } + + template + DEC_t decode(ENC_t value); + + template + typename std::enable_if<(ST==data_t::INT), int64_t>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::UINT), uint64_t>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::FLOAT), float>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::DOUBLE), double>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::BOOL), bool>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::DATE), std::time_t>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + std::string decode(std::array &val) { + return std::string(reinterpret_cast(val.data())); + } +} // namespace data_types + + +// ENCODE METHODS SPECIALIZATION FOR UINT64 ENC_t +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value; + try { value = std::stoull(str); } + catch(...) { value = kNullValue; } + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval; + int64_t value; + try { value = stoll(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval; + float value; + try { value = stof(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval; + double value; + try { value = stod(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + if (str.size() == 0) return kNullValue; + uint64_t encval = 1; + if ((str == "F") || (str == "f") || (str == "FALSE") + || (str == "false") || (str == "0")) encval = 0; + return encval; +} + + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval = 0; + memset(&encval, '\0', sizeof(encval)); + memcpy(&encval, str.c_str(), sizeof(encval)-1); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t val, value = 0; + std::string::iterator start = str.begin(); + for (unsigned i = 0; i < 4; i ++) { + std::string::iterator end = std::find(start, str.end(), '.'); + try { + val = std::stoull(std::string(start, end)); + } catch(...) 
{ + return kNullValue; + } + if (val < 256) { + value = (value << 8) + val; start = end + 1; + } else { + return kNullValue; + } + } + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%d", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%m/%d/%y", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +// ENCODE METHODS SPECIALIZATION FOR DOUBLE ENC_t + +template<> inline +double data_types::encode(std::string &str) { + double encval; + uint64_t value; + try { value = std::stoull(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double encval; + int64_t value; + try { value = stoll(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double encval; + float value; + try { value = stof(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double value; + try { value = stod(str); } + catch(...) { return kNullValue; } + return value; +} + +template<> inline +double data_types::encode(std::string &str) { + if (str.size() == 0) return kNullValue; + double encval = 1; + if ((str == "F") || (str == "f") || (str == "FALSE") + || (str == "false") || (str == "0")) encval = 0; + return encval; +} + + +template<> inline +double data_types::encode(std::string &str) { + double encval = 0; + memset(&encval, '\0', sizeof(encval)); + memcpy(&encval, str.c_str(), sizeof(encval)-1); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + uint64_t val, value = 0; + std::string::iterator start = str.begin(); + for (unsigned i = 0; i < 4; i ++) { + std::string::iterator end = std::find(start, str.end(), '.'); + try { + val = std::stoull(std::string(start, end)); + } catch(...) { + return kNullValue; + } + if (val < 256) { + value = (value << 8) + val; start = end + 1; + } else { + return kNullValue; + } + } + double encval; + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%d", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) 
{ + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +double data_types::encode(std::string &str) { + double value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%m/%d/%y", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +double data_types::encode(std::string &str) { + double value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +// ENCODE METHODS SPECIALIZATION FOR TIME_T ENC_t (same as long) +template<> inline +time_t data_types::encode(std::string &str) { + time_t value; + try { value = std::stoul(str); } + catch(...) { value = kNullValue; } + return value; +} + +template<> inline +time_t data_types::encode(std::string &str) { + int64_t value; + try { value = stol(str); } + catch(...) { return kNullValue; } + return value; +} + +template<> inline +time_t data_types::encode(std::string &str) { + time_t encval; + float value; + try { value = stof(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +time_t data_types::encode(std::string &str) { + time_t encval; + double value; + try { value = stod(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +time_t data_types::encode(std::string &str) { + if (str.size() == 0) return kNullValue; + time_t encval = 1; + if ((str == "F") || (str == "f") || (str == "FALSE") + || (str == "false") || (str == "0")) encval = 0; + return encval; +} + + +template<> inline +time_t data_types::encode(std::string &str) { + time_t encval = 0; + memset(&encval, '\0', sizeof(encval)); + memcpy(&encval, str.c_str(), sizeof(encval)-1); + return encval; +} + +template<> inline +time_t data_types::encode(std::string &str) { + time_t val, value = 0; + std::string::iterator start = str.begin(); + for (unsigned i = 0; i < 4; i ++) { + std::string::iterator end = std::find(start, str.end(), '.'); + try { + val = std::stoull(std::string(start, end)); + } catch(...) { + return kNullValue; + } + if (val < 256) { + value = (value << 8) + val; start = end + 1; + } else { + return kNullValue; + } + } + return value; +} + +template<> inline +time_t data_types::encode(std::string &str) { + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%d", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + return t; +} + +template<> inline +time_t data_types::encode(std::string &str) { + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%m/%d/%y", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + return t; +} + +template<> inline +time_t data_types::encode(std::string &str) { + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) 
{ + return kNullValue; + } + return t; +} + +template +ENC_t data_types::encode(IN_t &in, data_types::data_t dt) { + switch (dt) { +// case data_types::STRING : +// return data_types::encode(in); +// case data_types::CHARS : +// return data_types::encode(in); + case data_types::UINT : + return data_types::encode(in); + case data_types::INT : + return data_types::encode(in); + case data_types::FLOAT : + return data_types::encode(in); + case data_types::DOUBLE : + return data_types::encode(in); + case data_types::BOOL : + return data_types::encode(in); + case data_types::DATE : + return data_types::encode(in); + case data_types::USDATE : + return data_types::encode(in); + case data_types::DATE_TIME : + return data_types::encode(in); + case data_types::IP_ADDRESS : + return data_types::encode(in); + } + return data_types::kNullValue; +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + return std::to_string(value); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + int64_t v; + memcpy(&v, &value, sizeof(v)); + return std::to_string(v); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + float v; + memcpy(&v, &value, sizeof(v)); + return std::to_string(v); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + double v; + memcpy(&v, &value, sizeof(v)); + return std::to_string(v); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + std::string ipAddr = ""; + uint64_t octets[4]; + for (uint64_t k = 0; k < 4; k ++) {octets[k] = value & 255; value = value >> 8;} + for (uint64_t k = 3; k >= 1; k --) ipAddr += std::to_string(octets[k]) + '.'; + return ipAddr + std::to_string(octets[0]); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + return std::to_string(value); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + time_t t = data_types::decode(value); + char dateString[11]; + strftime(dateString, 11, "%Y-%m-%d", std::localtime(&t)); + return std::string(dateString); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + const char* c = reinterpret_cast(&value); + return std::string(c); +} + +template <> inline +uint64_t data_types::decode(uint64_t encvalue) { + return encvalue; +} +} // namespace shad + +#endif // LIBGALOIS_INCLUDE_SHAD_DATA_TYPES_H_ diff --git a/libgalois/include/shad/Graph.h b/libgalois/include/shad/Graph.h new file mode 100644 index 0000000000..9029b1ef32 --- /dev/null +++ b/libgalois/include/shad/Graph.h @@ -0,0 +1,169 @@ +//TODO(hc): Upgrade copyright if it is necessary; for now, we have no plan +// to make this public. + +//===------------------------------------------------------------*- C++ -*-===// +// +// The AGILE Workflows +// +//===----------------------------------------------------------------------===// +// ** Pre-Copyright Notice +// +// This computer software was prepared by Battelle Memorial Institute, +// hereinafter the Contractor, under Contract No. DE-AC05-76RL01830 with the +// Department of Energy (DOE). All rights in the computer software are reserved +// by DOE on behalf of the United States Government and the Contractor as +// provided in the Contract. 
You are authorized to use this computer software +// for Governmental purposes but it is not to be released or distributed to the +// public. NEITHER THE GOVERNMENT NOR THE CONTRACTOR MAKES ANY WARRANTY, EXPRESS +// OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. This +// notice including this sentence must appear on any copies of this computer +// software. +// +// ** Disclaimer Notice +// +// This material was prepared as an account of work sponsored by an agency of +// the United States Government. Neither the United States Government nor the +// United States Department of Energy, nor Battelle, nor any of their employees, +// nor any jurisdiction or organization that has cooperated in the development +// of these materials, makes any warranty, express or implied, or assumes any +// legal liability or responsibility for the accuracy, completeness, or +// usefulness or any information, apparatus, product, software, or process +// disclosed, or represents that its use would not infringe privately owned +// rights. Reference herein to any specific commercial product, process, or +// service by trade name, trademark, manufacturer, or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or any agency thereof, or Battelle Memorial +// Institute. The views and opinions of authors expressed herein do not +// necessarily state or reflect those of the United States Government or any +// agency thereof. +// +// PACIFIC NORTHWEST NATIONAL LABORATORY +//===----------------------------------------------------------------------===// + +#ifndef LIBGALOIS_INCLUDE_SHAD_GRAPH_H_ +#define LIBGALOIS_INCLUDE_SHAD_GRAPH_H_ + +#include +#include +#include + +#include "DataTypes.h" +#include "GraphTypes.h" + +#define UINT shad::data_types::UINT +#define DOUBLE shad::data_types::DOUBLE +#define USDATE shad::data_types::USDATE +#define ENCODE shad::data_types::encode + +namespace shad { + +class Vertex { + public: + // Vertex id; initially it is set + // to a local node id while CuSP reads a file and constructs + // this vertex. After each host finishes and synchronizes it to construct + // a full CSR graph, it is updated to a global node id. + uint64_t id; + TYPES type; + uint64_t shadKey; + // Number of edges. + // This is incremented while reads a graph. 
+ uint64_t numEdges{0}; + + Vertex () { + this->id = shad::data_types::kNullValue; + this->type = TYPES::NONE; + this->shadKey = shad::data_types::kNullValue; + } + + Vertex (uint64_t id_, TYPES type_, uint64_t shadKey_) { + this->id = id_; + this->type = type_; + this->shadKey = shadKey_; + } + + void incrNumEdges() { + this->numEdges += 1; + } + + uint64_t getNumEdges() { + return this->numEdges; + } +}; + +class Edge { + public: + uint64_t src; // vertex id of src + uint64_t dst; // vertex id of dst + TYPES type; + TYPES src_type; + TYPES dst_type; + uint64_t src_glbid; + uint64_t dst_glbid; + + Edge () { + src = shad::data_types::kNullValue; + dst = shad::data_types::kNullValue; + type = TYPES::NONE; + src_type = TYPES::NONE; + dst_type = TYPES::NONE; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } + + Edge (std::vector & tokens) { + if (tokens[0] == "Sale") { + src = ENCODE(tokens[1]); + dst = ENCODE(tokens[2]); + type = TYPES::SALE; + src_type = TYPES::PERSON; + dst_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "Author") { + src = ENCODE(tokens[1]); + type = TYPES::AUTHOR; + src_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") dst = ENCODE(tokens[3]); + else if (tokens[4] != "") dst = ENCODE(tokens[4]); + else if (tokens[5] != "") dst = ENCODE(tokens[5]); + if (tokens[3] != "") dst_type = TYPES::FORUM; + else if (tokens[4] != "") dst_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") dst_type = TYPES::PUBLICATION; + } else if (tokens[0] == "Includes") { + src = ENCODE(tokens[3]); + dst = ENCODE(tokens[4]); + type = TYPES::INCLUDES; + src_type = TYPES::FORUM; + dst_type = TYPES::FORUMEVENT; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "HasTopic") { + dst = ENCODE(tokens[6]); + type = TYPES::HASTOPIC; + dst_type = TYPES::TOPIC; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") src = ENCODE(tokens[3]); + else if (tokens[4] != "") src = ENCODE(tokens[4]); + else if (tokens[5] != "") src = ENCODE(tokens[5]); + if (tokens[3] != "") src_type = TYPES::FORUM; + else if (tokens[4] != "") src_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") src_type = TYPES::PUBLICATION; + } else if (tokens[0] == "HasOrg") { + src = ENCODE(tokens[5]); + dst = ENCODE(tokens[6]); + type = TYPES::HASORG; + src_type = TYPES::PUBLICATION; + dst_type = TYPES::TOPIC; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } + } +}; + +} // namespace agile::workflow1 + +#endif // GRAPH_H diff --git a/libgalois/include/shad/GraphTypes.h b/libgalois/include/shad/GraphTypes.h new file mode 100644 index 0000000000..eb84e123c2 --- /dev/null +++ b/libgalois/include/shad/GraphTypes.h @@ -0,0 +1,71 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// The AGILE Workflows +// +//===----------------------------------------------------------------------===// +// ** Pre-Copyright Notice +// +// This computer software was prepared by Battelle Memorial Institute, +// hereinafter the Contractor, under Contract No. DE-AC05-76RL01830 with the +// Department of Energy (DOE). 
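// ---------------------------------------------------------------------------
// Illustrative example (hypothetical row, not part of this patch) of how the
// Edge constructor in Graph.h above maps one WMD CSV record onto a typed
// edge. Only the columns the constructor actually reads (tokens[0] through
// tokens[6]) matter:
//
//   "Sale,1001,2002,,,,,,,"  ->  tokens[0] == "Sale"
//       src  = ENCODE(tokens[1])      // person key "1001"
//       dst  = ENCODE(tokens[2])      // person key "2002"
//       type = TYPES::SALE,  src_type = dst_type = TYPES::PERSON
//
// The reader further below (ShadGraphConverter::readSHADFile) symmetrizes
// every such record by also inserting the mirrored edge with endpoints (and
// endpoint types) swapped and the reverse relation: SALE/PURCHASE,
// AUTHOR/WRITTENBY, INCLUDES/INCLUDEDIN, HASTOPIC/TOPICIN, HASORG/ORGIN.
// ---------------------------------------------------------------------------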
All rights in the computer software are reserved +// by DOE on behalf of the United States Government and the Contractor as +// provided in the Contract. You are authorized to use this computer software +// for Governmental purposes but it is not to be released or distributed to the +// public. NEITHER THE GOVERNMENT NOR THE CONTRACTOR MAKES ANY WARRANTY, EXPRESS +// OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. This +// notice including this sentence must appear on any copies of this computer +// software. +// +// ** Disclaimer Notice +// +// This material was prepared as an account of work sponsored by an agency of +// the United States Government. Neither the United States Government nor the +// United States Department of Energy, nor Battelle, nor any of their employees, +// nor any jurisdiction or organization that has cooperated in the development +// of these materials, makes any warranty, express or implied, or assumes any +// legal liability or responsibility for the accuracy, completeness, or +// usefulness or any information, apparatus, product, software, or process +// disclosed, or represents that its use would not infringe privately owned +// rights. Reference herein to any specific commercial product, process, or +// service by trade name, trademark, manufacturer, or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or any agency thereof, or Battelle Memorial +// Institute. The views and opinions of authors expressed herein do not +// necessarily state or reflect those of the United States Government or any +// agency thereof. +// +// PACIFIC NORTHWEST NATIONAL LABORATORY +// operated by +// BATTELLE +// for the +// UNITED STATES DEPARTMENT OF ENERGY +// under Contract DE-AC05-76RL01830 +//===----------------------------------------------------------------------===// + +#ifndef LIBGALOIS_INCLUDE_SHAD_GRAPHTYPES_H_ +#define LIBGALOIS_INCLUDE_SHAD_GRAPHTYPES_H_ + +namespace shad { + +enum class TYPES { + PERSON, + FORUMEVENT, + FORUM, + PUBLICATION, + TOPIC, + PURCHASE, + SALE, + AUTHOR, + WRITTENBY, + INCLUDES, + INCLUDEDIN, + HASTOPIC, + TOPICIN, + HASORG, + ORGIN, + NONE +}; + +} // namespace agile::workflow1 + +#endif // GRAPHTYPES_H diff --git a/libgalois/include/shad/ShadGraphConverter.h b/libgalois/include/shad/ShadGraphConverter.h new file mode 100644 index 0000000000..5162fc8dfb --- /dev/null +++ b/libgalois/include/shad/ShadGraphConverter.h @@ -0,0 +1,712 @@ +#ifndef LIBGALOIS_INCLUDE_SHAED_GRAPH_READER_H_ +#define LIBGALOIS_INCLUDE_SHAED_GRAPH_READER_H_ + +#include +#include + +#include "galois/graphs/BufferedGraph.h" + +#include "shad/DataTypes.h" +#include "shad/Graph.h" +#include "shad/GraphTypes.h" + +namespace shad { + +/** + * TODO(hc): This is a shared-memory version. + * Later, a distributed-memory version in libgluon will reuse this code. + */ +template +class ShadGraphConverter { + +public: + ShadGraphConverter() : + nodeDataBuffer(nullptr) {} + + ~ShadGraphConverter() { + // BufferedGraph holds these arrays. + outIndexBuffer = nullptr; + nodeDataBuffer = nullptr; + edgeDestBuffer = nullptr; + edgeDataBuffer = nullptr; + } + + /** + * @brief Flush a graph topology to a file for debugging. 
+ */ + void flushGraphTopology() { + std::ofstream fp("shad_graph.out"); + for (size_t i = 0; i < this->verticeIdKeyMapping.size(); ++i) { + uint64_t key = this->verticeIdKeyMapping[i]; + Vertex v = this->vertices[key]; + fp << "node " << i << ", type: " << to_underlying(v.type) << "\n"; + auto edgeRange = this->edges.equal_range(key); + for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { + Edge& edge = ei->second; + Vertex dst = this->vertices[edge.dst]; + fp << "\t edge dst " << dst.id << ", type: " << + to_underlying(edge.type) << "\n"; + } + } + fp.close(); + } + + /** + * @brief Read a input graph file and inspect the number of nodes and edges. + * @detail In order to construct a dense LC_CSR_Graph, we need to know how + * many edges and nodes exist. This method reads one line by one line, and + * counts those information. + * Note that this method assumes that the types of {"Person", "ForumEvent", + * "Forum", "Publication", "Topic"} are nodes, and the types of + * {"SALE", "Author", "Includes", "HasTopic", "HasOrg"} are edges. + * + * @param filename file name to read + * @param numNodes number of nodes that this method reads + * @param numEdges number of edges that this method reads + */ + void InspectGraph(const std::string& filename, size_t* numNodes, + size_t* numEdges) { + // TODO(hc): Get the number of nodes and edges from file + // For example, it reads {SALE, Author, Includes, HasTopic, HasOrg} as + // edges. So we just count how many they exist in the file. + + std::string line; + std::ifstream file(filename); + if (!file.is_open()) { + std::cerr << "Cannot open file " << filename << "\n"; + exit(-1); + } + while (!file.eof()) { + getline(file, line); + // Skip comments. + if (line[0] == '#') continue; + // Delimiter and # tokens set for WMD data file. + std::vector tokens = splitTokens(line, ',', 10); + + if (this->isTokenNodeType(tokens[0])) { + ++(*numNodes); + } else if (this->isTokenEdgeType(tokens[0])) { + *numEdges += 2; + } + } + + std::cout << "Number of nodes:" << *numNodes << ", number of edges:" << + *numEdges << "\n"; + } + + /** + * @brief Construct a buffered graph from existing arrays constructed + * by constructNodeArrays() and constructEdgeArrays(). + * + * @param numGlobalNodes The number of global nodes + * @param numGlobalEdges The number of global edges + * @param nodeBegin Global node ID of the first local node + * @param nodeEnd (Global node ID of the last local node) + 1 + * @param edgeBegin Global edge ID of the first local edge + * @param edgeEnd (Global edge ID of the last local edge) + 1 + * @param bufferedGraph Buffered graph for CuSP + */ + void constructBufferedGraph( + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint32_t nodeBegin, uint32_t nodeEnd, + uint64_t edgeBegin, uint64_t edgeEnd, + [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { + // TODO(hc): Each of these functions first construct graphs in the SHAD + // format as this file is written in not binary, but string, and also + // nodes or edges are not sorted. 
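// ---------------------------------------------------------------------------
// Quick sanity example (hypothetical input, not part of this patch) for
// InspectGraph() above: a file with 3 "Person" rows, 1 "Forum" row and
// 2 "Sale" rows yields numNodes == 4 and numEdges == 4, because every edge
// record is counted twice -- once for itself and once for the reverse edge
// that readSHADFile() later inserts to symmetrize the graph.
// ---------------------------------------------------------------------------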
So, until we preprocess the input graph + // file, we should first read it in memory, and reconstruct this to Galois + // compatible + + uint32_t numLocalNodes = nodeEnd - nodeBegin; + uint64_t numLocalEdges = edgeEnd - edgeBegin; + + bufferedGraph->constructFrom( + outIndexBuffer, edgeDestBuffer, edgeDataBuffer, + numGlobalNodes, numGlobalEdges, numLocalNodes, numLocalEdges, + nodeBegin, edgeBegin); +#if 0 + TODO(hc): This verification should be fixed since it tests + a shared-memory execution that one host loads the whole + graph. It should not work on distributed-memory machine + since a CSR graph should be partitioned but tepmorary + maps reading and holding SHAD graphs are for global graph. +#ifndef NDEBUG + std::cout << "CSR verification starts.." << std::endl << std::flush; + this->VerifyCSRConstruction(outIndexBuffer, nodeDataBuffer, + edgeDestBuffer, edgeDataBuffer); + std::cout << "CSR verification starts.. [done]" << std::endl << std::flush; +#endif +#endif + // TODO(hc): Construct `buffer_graph`. + } + + /** + * @brief Read SHAD graph file and construct in-memory buffer SHAD graph. + * + * @param filename SHAD graph file name + */ + // TODO(hc): We can assign a disjointed range of file for each host. + // For now, let all hosts read the whole file. + void readSHADFile( + const std::string& filename, uint64_t* numGlobalNodes, + uint64_t *numGlobalEdges) { + std::ifstream graphFile(filename.c_str()); + uint64_t vertexId{0}; + std::string line; + uint64_t numNodes{0}, numEdges{0}; + // TODO(hc): We can parallelize it by assigning disjointed + // ranges with some inspection. + // But this would be the future work as + while (!graphFile.eof()) { + getline(graphFile, line); + // Skip comments. + if (line[0] == '#') continue; + // Delimiter and # tokens set for WMD data file. 
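// ---------------------------------------------------------------------------
// Note on the tokenizer used here (splitTokens(), defined further below in
// this class): it returns a fixed-size vector -- 10 entries for these WMD
// rows -- and keeps empty columns as empty strings, which is what lets the
// branches below test tokens[3]/tokens[4]/tokens[5] to tell Forum, ForumEvent
// and Publication endpoints apart. For a hypothetical row:
//
//   splitTokens("HasTopic,,,77,,,42,,,", ',', 10)
//     -> {"HasTopic", "", "", "77", "", "", "42", "", "", ""}
// ---------------------------------------------------------------------------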
+ std::vector tokens = splitTokens(line, ',', 10); + + if (tokens[0] == "Person") { + insertSHADVertex(ENCODE(tokens[1]), + TYPES::PERSON, vertexId); + ++numNodes; + } else if (tokens[0] == "ForumEvent") { + insertSHADVertex(ENCODE(tokens[4]), + TYPES::FORUMEVENT, vertexId); + ++numNodes; + } else if (tokens[0] == "Forum") { + insertSHADVertex(ENCODE(tokens[3]), + TYPES::FORUM, vertexId); + ++numNodes; + } else if (tokens[0] == "Publication") { + insertSHADVertex(ENCODE(tokens[5]), + TYPES::PUBLICATION, vertexId); + ++numNodes; + } else if (tokens[0] == "Topic") { + insertSHADVertex(ENCODE(tokens[6]), + TYPES::TOPIC, vertexId); + ++numNodes; + } else if (tokens[0] == "Sale") { + Edge sale(tokens); + insertSHADEdge(sale.src, sale); + + Edge purchase = sale; + purchase.type = TYPES::PURCHASE; + std::swap(purchase.src, purchase.dst); + insertSHADEdge(purchase.src, purchase); + numEdges += 2; + } else if (tokens[0] == "Author") { + Edge authors(tokens); + insertSHADEdge(authors.src, authors); + + Edge writtenBY = authors; + writtenBY.type = TYPES::WRITTENBY; + std::swap(writtenBY.src, writtenBY.dst); + std::swap(writtenBY.src_type, writtenBY.dst_type); + insertSHADEdge(writtenBY.src, writtenBY); + numEdges += 2; + } else if (tokens[0] == "Includes") { + Edge includes(tokens); + insertSHADEdge(includes.src, includes); + + Edge includedIN = includes; + includedIN.type = TYPES::INCLUDEDIN; + std::swap(includedIN.src, includedIN.dst); + std::swap(includedIN.src_type, includedIN.dst_type); + insertSHADEdge(includedIN.src, includedIN); + numEdges += 2; + } else if (tokens[0] == "HasTopic") { + Edge hasTopic(tokens); + insertSHADEdge(hasTopic.src, hasTopic); + + Edge topicIN = hasTopic; + topicIN.type = TYPES::TOPICIN; + std::swap(topicIN.src, topicIN.dst); + std::swap(topicIN.src_type, topicIN.dst_type); + insertSHADEdge(topicIN.src, topicIN); + numEdges += 2; + } else if (tokens[0] == "HasOrg") { + Edge hasOrg(tokens); + insertSHADEdge(hasOrg.src, hasOrg); + + Edge orgIN = hasOrg; + orgIN.type = TYPES::ORGIN; + std::swap(orgIN.src, orgIN.dst); + std::swap(orgIN.src_type, orgIN.dst_type); + insertSHADEdge(orgIN.src, orgIN); + numEdges += 2; + } + } + + // After the above loop, vertices and edges are complete. + this->CountNumEdgesForEachVertex(numNodes, numEdges); + *numGlobalNodes = numNodes; + *numGlobalEdges = numEdges; + +#ifndef NDEBUG + this->VerifySHADGraphRead(filename); +#endif + } + + /** + * @brief Return node data array. + * Note that this can be either of global graph or local graph. + */ + uint64_t* getNodeDataBuffer() { + return nodeDataBuffer; + } + + /** + * @brief Return node outgoing edge index array + * Note that this can be either of global graph or local graph. + */ + uint64_t* getOutIndexBuffer() { + return outIndexBuffer; + } + + /** + * @brief Construct vertex outgoing edge range buffer and + * vertex data buffer. + * + * @detail Extract local vertices' outgoing edge ranges and + * data from a temprory buffer of vertex map that is read and constructed + * from a SHAD CSV graph file. Note that these arrays are for local graph + * partition and their indices should be corresponding to local node ids. + * + * @param nodeBegin Global node ID of the first local node + * @param nodeEnd (Global node ID of the last local node + 1) + * @param numLocalNodes The number of local nodes + * + */ + void constructNodeArrays( + uint32_t nodeBegin, uint32_t nodeEnd, uint32_t numLocalNodes) { + // 1) Construct an edge index array (size == number of nodes). 
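// ---------------------------------------------------------------------------
// Worked example (illustrative, not part of this patch) of the out-index
// construction that follows: the array is first filled with per-vertex
// degrees and then turned into an inclusive prefix sum, so local degrees
// {2, 0, 3, 1} become outIndexBuffer = {2, 2, 5, 6}, and vertex i owns the
// edge slots [outIndexBuffer[i-1], outIndexBuffer[i]) (lower bound 0 when
// i == 0). When the stored offsets are global edge ids, the
// constructEdgeArrays() overloads below subtract the partition's first
// global edge id (edgeBegin) to map them back into edgeDestBuffer.
// ---------------------------------------------------------------------------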
+ this->outIndexBuffer = new uint64_t[numLocalNodes]; + this->nodeDataBuffer = new uint64_t[numLocalNodes]; + + // TODO(hc): for now, only consider a single host, but need to add offset later. + galois::do_all(galois::iterate(this->vertices), + [&](auto element) { + Vertex& vertex = element.second; + uint64_t vertexId = vertex.id; + if (vertexId >= nodeBegin && vertexId < nodeEnd) { + this->outIndexBuffer[vertexId - nodeBegin] = + vertex.getNumEdges(); + // Fill vertex data too; This assumes that a SHAD graph + // has a type, which is considered as a vertex data. + this->nodeDataBuffer[vertexId - nodeBegin] = + this->to_underlying(vertex.type); + } + }); + // 2) Perform parallel prefix sum to finalize outgoing edge index + // array construction. + galois::ParallelSTL::partial_sum( + outIndexBuffer, &(outIndexBuffer[numLocalNodes]), + outIndexBuffer); + } + + /** + * @brief Construct edge destination and data arrays. + * + * @detail Extract local edge destination and data from a + * temprory buffer of edge map that is read and constructed + * from a SHAD CSV graph file. Note that these arrays are for local graph + * partition and their indices should be corresponding to local node ids. + * + * @tparam T Edge data type; if this is not void, edge data array is + * constructed + * + * @param nodeBegin Global node ID of the first local node + * @param edgeBegin Global edge ID of the first local edge + * @param numLocalNodes The number of local nodes + * @param numLocalEdges The number of local edges + * + */ + template >* = nullptr> + void constructEdgeArrays( + uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, + uint64_t numLocalEdges) { + this->edgeDestBuffer = new uint32_t[numLocalEdges]; + this->edgeDataBuffer = new EdgeDataTy[numLocalEdges]; + std::vector edgeIndexPointers(numLocalNodes, 0); + galois::on_each([&](uint32_t tid, uint32_t numThreads) { + // 1) Find disjointed node range for each thread. + auto thread_work_range = + galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); + // 2) Each thread iterates the whole edges. + for (auto edgeElem : this->edges) { + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertexId = vertex.id; + // 3) Each thread fills edge destination for the assigned nodes. + if (srcVertexId >= thread_work_range.first + nodeBegin && + srcVertexId < thread_work_range.second + nodeBegin) { + uint64_t edgeIdx = edgeIndexPointers[srcVertexId - nodeBegin]++; + // OutIndexBuffer now contains global edge range. + // So we need to subtract edge offset to get the local edge id. + uint64_t nodeBaseOffset = + ((srcVertexId - nodeBegin) == 0)? + 0 : outIndexBuffer[srcVertexId - nodeBegin - 1] - edgeBegin; + edgeDestBuffer[edgeIdx + nodeBaseOffset] = + this->vertices[edgeElem.second.dst].id; + edgeDataBuffer[edgeIdx + nodeBaseOffset] = + to_underlying(edgeElem.second.type); + } + } + }); + // Or inspector/executor model + // But that might be more expensive. + } + + /** + * @brief Construct edge destination array + * + * @detail Extract local edge destination from a + * temprory buffer of edge map that is read and constructed + * from a SHAD CSV graph file. Note that this array is for local graph + * partition and their indices should be corresponding to local node ids. 
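// ---------------------------------------------------------------------------
// Note (illustrative, not part of this patch) on the threading pattern used
// by constructEdgeArrays() above, its void-edge-data overload that follows,
// and CountNumEdgesForEachVertex() further below: every thread scans the
// whole edge multimap, but galois::block_range() hands each thread a
// disjoint, contiguous slice of the local vertex ids, e.g. for 8 nodes and
// 4 threads
//
//   tid 0 -> [0, 2)   tid 1 -> [2, 4)   tid 2 -> [4, 6)   tid 3 -> [6, 8)
//
// so no two threads ever write the same outIndexBuffer/edgeDestBuffer slots.
// The exact split points for uneven divisions are an implementation detail
// of galois::block_range().
// ---------------------------------------------------------------------------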
+ * + * @tparam T Edge data type; This function is enabled when + * edge data type is void + * + * @param nodeBegin Global node ID of the first local node + * @param edgeBegin Global edge ID of the first local edge + * @param numLocalNodes The number of local nodes + * @param numLocalEdges The number of local edges + * + */ + template >* = nullptr> + void constructEdgeArrays( + uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, + uint64_t numLocalEdges) { + edgeDestBuffer = new uint32_t[numLocalEdges]; + std::vector edgeIndexPointers(numLocalNodes, 0); + galois::on_each([&](uint32_t tid, uint32_t numThreads) { + // 1) Find disjointed node range for each thread. + auto thread_work_range = + galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); + // 2) Each thread iterates the whole edges. + for (auto edgeElem : this->edges) { + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertexId = vertex.id; + // 3) Each thread fills edge destination for the assigned nodes. + if (srcVertexId >= thread_work_range.first + nodeBegin && + srcVertexId < thread_work_range.second + nodeBegin) { + uint64_t edgeIdx = edgeIndexPointers[srcVertexId - nodeBegin]++; + uint64_t nodeBaseOffset = + ((srcVertexId - nodeBegin)== 0)? + 0 : outIndexBuffer[srcVertexId - 1] - edgeBegin; + edgeDestBuffer[edgeIdx + nodeBaseOffset] = + this->vertices[edgeElem.second.dst].id; + } + } + }); + // Or inspector/executor model + // But that might be more expensive. + } + + /** + * @brief Extract outgoing edge index ranges for local vertices + * from the global outgoing edge index range array. + * + * @param nodeBegin Node global id of the first local node + * @param nodeEnd (Node global id for the last local node + 1) + */ + void extractLocalOutIndexArray(uint32_t nodeBegin, uint32_t nodeEnd) { + + uint64_t* newOutIndexBuffer = new uint64_t[nodeEnd - nodeBegin]; + galois::do_all(galois::iterate(nodeBegin, nodeEnd), + [&](uint32_t n) { + newOutIndexBuffer[n - nodeBegin] = this->outIndexBuffer[n]; + } ); + delete[] this->outIndexBuffer; + this->outIndexBuffer = newOutIndexBuffer; + } + + /** + * @brief Check if a type of a node having the passed id is + * equal to the one in a temporary vertex map constructed from + * SHAD graph file. + * + * @param id Node global id to check + * @param type Node type + * + * @return True if passed information matches to the one in + * a temporary vertex map + */ + bool checkNode(uint64_t id, uint64_t type) { + uint64_t key = this->verticeIdKeyMapping[id]; + Vertex& vertex = this->vertices[key]; + return (this->to_underlying(vertex.type) == type); + } + + /** + * @brief Check if a type of a edge having the passed id is + * equal to the one in a temporary edge map constructed from + * SHAD graph file. 
+ * + * @param snid Global node ID of the source node of an edge + * @param dnid Global node ID of the destination node of an edge + * @param type Edge type + * @param type Edge type + * + * @return True if passed information matches to the one in + * a temporary edge map + */ + bool checkEdge(uint64_t snid, uint64_t dnid, + uint64_t eid, uint64_t type) { + uint64_t skey = this->verticeIdKeyMapping[snid]; + uint64_t dkey = this->verticeIdKeyMapping[dnid]; + + Vertex& vertex = this->vertices[skey]; + auto edgeRange = this->edges.equal_range(skey); + uint64_t eidx{0}; + Edge edge; + bool found{false}; + for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei, ++eidx) { + edge = ei->second; + // Multiple edges having the same source and destination could + // exist. So we repeat until find the one that has the same type to + // the passed one. + if (this->vertices[edge.dst].id == dnid && + this->to_underlying(edge.type) == type) { + found = true; + break; + } + } + return found; + } + +private: + /** + * @brief Return true if a token is a node type. + * + * @param token Token parsed from a graph file to check + */ + bool isTokenNodeType(std::string token) { + if (token == "Person" || token == "ForumEvent" || token == "Forum" || + token == "Publication" || token == "Topic") { + return true; + } else { + return false; + } + } + + /** + * @brief Return true if a token is an edge type. + * + * @param token Token parsed from a graph file to check + */ + bool isTokenEdgeType(std::string token) { + if (token == "Sale" || token == "Author" || token == "Includes" || + token == "HasTopic" || token == "HasOrg") { + return true; + } else { + return false; + } + } + + std::vector splitTokens( + std::string& line, char delim, uint64_t size = 0) { + uint64_t ndx = 0, start = 0, end = 0; + std::vector tokens(size); + + for ( ; end < line.length(); end ++) { + if ((line[end] == delim) || (line[end] == '\n')) { + tokens[ndx] = line.substr(start, end - start); + start = end + 1; + ndx ++; + } + } + + // Flush the last token. + tokens[size - 1] = line.substr(start, end - start); + return tokens; + } + + void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t numEdges) { + //galois::on_each([this, numNodes, numEdges]( + galois::on_each([&]( + uint32_t tid, uint32_t numThreads) { + // Each thread is assigned disjointed range of nodes. + // Each thread iterates edges and accumulates edges for only + // the nodes assigned to that. + auto thread_work_range = + galois::block_range(uint64_t{0}, numNodes, tid, numThreads); + for (auto edgeElem : this->edges) { + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; + if (vertex.id >= thread_work_range.first && + vertex.id < thread_work_range.second) { + vertex.incrNumEdges(); + } + } + }); + +#ifndef NDEBUG + this->VerifyNumEdgesPerVertex(numEdges); +#endif + } + + /** + * @brief Insert SHAD vertex to a vertex map. + * + * @param key SHAD token key + * @param type SHAD vertex type + * @param id Vertex id; Local vertex id until it is synchronized + */ + void insertSHADVertex(const uint64_t& key, const TYPES& type, uint64_t& id) { + auto found = this->vertices.find(key); + if (found == this->vertices.end()) { + this->vertices[key] = Vertex(id, type, key); + this->verticeIdKeyMapping[id] = key; + id++; + } else { + std::cerr << "[error] There is no reason to have duplicated vertices\n"; + } + } + + /** + * @brief Insert SHAD edge to a edge map. 
+ * @detail Edges + * + * @param vertexKey Source vertex's SHAD token key + * @param edge Adjacent edge of the vertex + */ + void insertSHADEdge(const uint64_t& vertexKey, const Edge& edge) { + this->edges.insert({vertexKey, edge}); + } + + /* + uint64_t edge_begin(uint32_t n) { + return this->verticeIdKeyMapping[n] + */ + +#ifndef NDEBUG + /** + * @brief Verify in-meomry SHAD graph. + * + * @param filename SHAD graph file name + */ + // TODO(hc): This function can be parallelized but + // let me stick with sequential execution until the whole + // implementation works correctly. + void VerifySHADGraphRead(const std::string& filename) { + size_t numNodes{0}, numEdges{0}; + this->InspectGraph(filename, &numNodes, &numEdges); + // 1) Check the number of vertices and edges. + assert(this->vertices.size() == numNodes); + // Note that edges are doubled to symmetrize a graph. + assert(this->edges.size() == numEdges); + for ([[maybe_unused]] auto& element : this->edges) { + // 2) Check if a source node key of the edges map is equal to a source + // of an edge. + assert(element.first == element.second.src); + // 3) Check if vertex information in the edges map is equal to the one + // in the vertex map. + assert(element.second.src_type == + this->vertices[element.second.src].type); + assert(element.second.dst_type == + this->vertices[element.second.dst].type); + } + } + + void VerifyNumEdgesPerVertex([[maybe_unused]] uint64_t numEdges) { + // 4) Check if the total number of edges of each vertex is equal to + // the number of total edges counted during inspection. + uint64_t numAccumulatedEdges{0}; + for (auto& element : this->vertices) { + numAccumulatedEdges += element.second.getNumEdges(); + } + assert(numAccumulatedEdges == numEdges); + } + + void VerifyCSRConstruction( + [[maybe_unused]] uint64_t* outIndexBuffer, + [[maybe_unused]] uint64_t* nodeDataBuffer, + [[maybe_unused]] uint32_t* edgeDestBuffer, + [[maybe_unused]] void* edgeDataBuffer) {} + + template >* = nullptr> + void VerifyCSRConstruction( + uint64_t* outIndexBuffer, [[maybe_unused]] uint64_t* nodeDataBuffer, + uint32_t* edgeDestBuffer, EdgeDataTy* edgeDataBuffer) { + // 1) Iterate edge index array. + // 2) Compare each verteices' edge range with SHAD vertex + for (size_t i = 0; i < this->vertices.size(); ++i) { + Vertex& srcV = this->vertices[this->verticeIdKeyMapping[i]]; + uint64_t srcShadKey = srcV.shadKey; + assert(this->verticeIdKeyMapping[i] == srcV.shadKey); + uint64_t edgeBegin = (i == 0)? 0 : outIndexBuffer[i - 1]; + uint64_t edgeEnd = outIndexBuffer[i]; + assert(srcV.numEdges == edgeEnd - edgeBegin); + assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i])); + assert(srcV.id == i); + galois::do_all(galois::iterate(edgeBegin, edgeEnd), + [&](size_t j) { + uint32_t dstV = edgeDestBuffer[j]; + [[maybe_unused]] uint64_t edgeData = edgeDataBuffer[j]; + + [[maybe_unused]] bool found{false}; + auto edgeRange = this->edges.equal_range(srcShadKey); + size_t cnt{0}; + for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { + Edge& edge = ei->second; + if (this->vertices[edge.dst].id == dstV) { + // Multiple edges between vertices are possible. 
+ if (this->to_underlying(edge.type) == int(edgeData)) { + assert(this->vertices[edge.src].id == i); + assert(this->vertices[edge.src].id == srcV.id); + found = true; + } + } + cnt++; + } + assert((edgeEnd - edgeBegin) == cnt); + /* + for (auto i = this->edges.begin(); i != this->edges.end(); ++i) { + std::cout << srcId << " vs " << i->first << "\n"; + } + */ + assert(found); + }, galois::steal()); + } + } +#endif + + /** + * @brief Cast a type to an underlying type; in case of scoped enum, + * this should be an integral type. + * + * @param e + */ + template + constexpr typename std::underlying_type::type to_underlying(E e) noexcept { + return static_cast::type>(e); + } + + // This holds the whole global vertices and their + // information such as its type. A key is globla node ID, and its value + // is the information. + std::unordered_map vertices; + // This holds the whole global edges and their information + // such as its type. The key is global source node ID, and its + // value is an edge iterator pointing to adjacent edges to the source. + std::unordered_multimap edges; + // Key is global node id and value is corresponding key of that node + std::unordered_map verticeIdKeyMapping; + // TODO(hc): Always assume uint64_t node data type + uint64_t* nodeDataBuffer; + uint64_t* outIndexBuffer; + uint32_t* edgeDestBuffer; + EdgeDataTy* edgeDataBuffer; +}; + +}; // shad namespace + +#endif diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 22a18c7fdf..98df493175 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -15,15 +15,20 @@ set(sources src/layers/SoftmaxLayer.cpp ) +## TODO(hc): Note that these libraries should be hard-coded +## based on your own system. +## These should be automatic library linking. set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) +set(INTEL_COMPILER_LIBRARIES /home/hochan/intel/oneapi/compiler/2023.1.0/linux/compiler/lib/intel64_lin) set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) -target_link_libraries(galois_gnn galois_shmem) +target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_link_libraries(galois_gnn ${INTEL_LIBS}) +target_link_libraries(galois_gnn galois_shmem) target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index fff1d03ed4..447facef39 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -44,10 +44,11 @@ class GNNGraph { // galois::LargeArray>>; GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label); + bool has_single_class_label, bool useShad = false); //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& input_directory, const std::string& dataset_name, - GNNPartitionScheme partition_scheme, bool has_single_class_label); + GNNPartitionScheme partition_scheme, bool has_single_class_label, + bool useShad = false); //! 
Returns host id size_t host_id() const { return host_id_; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4a83753670..8fc68ea193 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -11,7 +11,8 @@ namespace { std::unique_ptr LoadPartition(const std::string& input_directory, const std::string& dataset_name, - galois::graphs::GNNPartitionScheme partition_scheme) { + galois::graphs::GNNPartitionScheme partition_scheme, + bool useShad) { // XXX input path std::string input_file = input_directory + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); @@ -20,13 +21,13 @@ LoadPartition(const std::string& input_directory, switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; @@ -65,14 +66,16 @@ unsigned layer_number_to_sync; galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label) + bool has_single_class_label, + bool useShad) : GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, - has_single_class_label) {} + has_single_class_label, useShad) {} galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label) + bool has_single_class_label, + bool useShad) : input_directory_(input_directory) { GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, dataset_name); @@ -84,7 +87,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, std::string("] "); // load partition partitioned_graph_ = - LoadPartition(input_directory_, dataset_name, partition_scheme); + LoadPartition(input_directory_, dataset_name, partition_scheme, useShad); // reverse edges partitioned_graph_->ConstructIncomingEdges(); diff --git a/lonestar/analytics/distributed/CMakeLists.txt b/lonestar/analytics/distributed/CMakeLists.txt index fa3046c679..546937cbda 100644 --- a/lonestar/analytics/distributed/CMakeLists.txt +++ b/lonestar/analytics/distributed/CMakeLists.txt @@ -6,6 +6,5 @@ add_subdirectory(connected-components) add_subdirectory(k-core) add_subdirectory(pagerank) add_subdirectory(partition) -add_subdirectory(matrixcompletion) add_subdirectory(sssp) add_subdirectory(triangle-counting) diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index ac9cf02060..87b12de63d 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -107,6 +107,8 @@ namespace cll = llvm::cl; extern cll::opt dataset; //! partitioning scheme to use extern cll::opt partitionScheme; +//! 
true if input graph file format is SHAD WMD +extern cll::opt useShad; // @todo command line argument for read balancing across hosts @@ -130,27 +132,26 @@ template std::unique_ptr> constructSymmetricGraph(std::vector&) { std::string inputFile = deepgalois::path + dataset + ".csgr"; galois::gInfo("File to read is ", inputFile); - switch (partitionScheme) { case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case GNN_OEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case GNN_CVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index e18340fe82..611a7c3e50 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -41,3 +41,8 @@ cll::opt partitionScheme( clEnumValN(GNN_CVC, "g-cvc", "gnn cvc: train nodes evenly distributed")), cll::init(GNN_OEC)); + +cll::opt useShad("useShad", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), + cll::init(false)); diff --git a/lonestar/libdistbench/include/DistBench/Input.h b/lonestar/libdistbench/include/DistBench/Input.h index 088bc82444..396b01a983 100644 --- a/lonestar/libdistbench/include/DistBench/Input.h +++ b/lonestar/libdistbench/include/DistBench/Input.h @@ -99,6 +99,8 @@ extern cll::opt inputFileTranspose; extern cll::opt symmetricGraph; //! partitioning scheme to use extern cll::opt partitionScheme; +//! true if input graph file format is SHAD WMD +extern cll::opt useShad; ////! path to vertex id map for custom edge cut // extern cll::opt vertexIDMapFileName; //! 
true if you want to read graph structure from a file @@ -143,18 +145,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case OEC: case IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose, - mastersFile); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFileTranspose, mastersFile); case HOVC: case HIVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); case CART_VCUT: case CART_VCUT_IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); // case CEC: @@ -164,18 +166,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: case GINGER_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad ,true, inputFileTranspose); case FENNEL_O: case FENNEL_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); default: GALOIS_DIE("partition scheme specified is invalid: ", partitionScheme); @@ -204,19 +206,19 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { auto& net = galois::runtime::getSystemNetworkInterface(); if (net.Num == 1) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); } switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("incoming edge cut requires transpose graph"); @@ -225,12 +227,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("incoming hybrid cut requires transpose graph"); @@ -239,13 +241,13 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("cvc incoming cut requires 
transpose graph"); @@ -258,12 +260,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -272,12 +274,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -286,7 +288,7 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); default: @@ -318,7 +320,7 @@ DistGraphPtr constructGraph(std::vector&) { if (net.Num == 1) { if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { fprintf(stderr, "WARNING: Loading transpose graph through in-memory " @@ -326,7 +328,7 @@ DistGraphPtr constructGraph(std::vector&) { "graph with -graphTranspose to avoid unnecessary " "overhead.\n"); return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); } } @@ -334,12 +336,12 @@ DistGraphPtr constructGraph(std::vector&) { switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("iec requires transpose graph"); @@ -348,12 +350,12 @@ DistGraphPtr constructGraph(std::vector&) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { GALOIS_DIE("hivc requires transpose graph"); @@ -362,13 +364,14 @@ DistGraphPtr constructGraph(std::vector&) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph(inputFile, 
galois::CUSP_CSC, - galois::CUSP_CSC, false, + galois::CUSP_CSC, useShad, + false, inputFileTranspose); } else { GALOIS_DIE("cvc requires transpose graph"); @@ -377,12 +380,12 @@ DistGraphPtr constructGraph(std::vector&) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -391,12 +394,12 @@ DistGraphPtr constructGraph(std::vector&) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -405,7 +408,7 @@ DistGraphPtr constructGraph(std::vector&) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); default: diff --git a/lonestar/libdistbench/src/Input.cpp b/lonestar/libdistbench/src/Input.cpp index 495f68c0c5..04321bd14e 100644 --- a/lonestar/libdistbench/src/Input.cpp +++ b/lonestar/libdistbench/src/Input.cpp @@ -60,6 +60,11 @@ cll::opt partitionScheme( "fennel, incoming edge cut, using CuSP")), cll::init(OEC)); +cll::opt useShad("useShad", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), + cll::init(false)); + cll::opt readFromFile("readFromFile", cll::desc("Set this flag if graph is to be " "constructed from file (file must be " diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 7719340224..d1685b8e2b 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -26,6 +26,11 @@ llvm::cl::opt partition_scheme( "Original Cartesian Vertex-Cut")), cll::init(galois::graphs::GNNPartitionScheme::kOEC)); +cll::opt useShad("useShad", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), + cll::init(false)); + llvm::cl::opt num_layers( "numLayers", cll::desc( @@ -341,7 +346,8 @@ std::vector CreateFanOutVector() { std::unique_ptr InitializeGraphNeuralNetwork() { // partition/load graph auto gnn_graph = std::make_unique( - input_directory, input_name, partition_scheme, !multiclass_labels); + input_directory, input_name, partition_scheme, !multiclass_labels, + useShad); // create layer types vector std::vector layer_types = CreateLayerTypesVector(); diff --git a/lonestar/scientific/cpu/longestedge/test/catch.hpp b/lonestar/scientific/cpu/longestedge/test/catch.hpp index 6c1756a6ce..841b9c8128 100644 --- a/lonestar/scientific/cpu/longestedge/test/catch.hpp +++ b/lonestar/scientific/cpu/longestedge/test/catch.hpp @@ -10723,6 +10723,13 @@ PVOID FatalConditionHandler::exceptionHandlerHandle = nullptr; #elif defined( CATCH_CONFIG_POSIX_SIGNALS ) +// MINSIGSTKSZ is not constexpr in the recent Linux, and so, +// requires manual declaration for backward compatibility. 
+// This number is from +// https://stackoverflow.com/questions/71454588/minsigstksz-error-after-update-in-my-manjaro-linux` +#undef MINSIGSTKSZ +#define MINSIGSTKSZ 16384 + namespace Catch { struct SignalDefs { From d9a1c69839a74d0ffd8cfba2d854c1199e87366c Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Wed, 2 Aug 2023 14:17:18 -0500 Subject: [PATCH 603/660] Use a separate vector to specify sampled nodes instead of a node label to extend its usage (#4) Co-authored-by: Hochan Lee --- libgnn/include/galois/graphs/GNNGraph.h | 14 ++++++++------ libgnn/src/graphs/GNNGraph.cpp | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 447facef39..6dbfdfbcf1 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -582,29 +582,29 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! Makes a node "sampled"; used for debugging/testing - void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } + void SetSampledNode(size_t node) { mark_sampled_nodes_[node] = 1; } //! Makes a node "not sampled"; used for debugging/testing - void UnsetSampledNode(size_t node) { partitioned_graph_->getData(node) = 0; } + void UnsetSampledNode(size_t node) { mark_sampled_nodes_[node] = 0; } //! Returns true if a particular node is currently considered "in" a sampled //! graph bool IsInSampledGraph(const NodeIterator& ni) const { // TODO(loc) GPU assert(*ni < size()); - return partitioned_graph_->getData(*ni); + return mark_sampled_nodes_[*ni]; } bool IsInSampledGraph(size_t node_id) const { // TODO(loc) GPU assert(node_id < size()); - return partitioned_graph_->getData(node_id); + return mark_sampled_nodes_[node_id]; } bool IsInSampledGraphSubgraph(size_t node_id) const { // TODO(loc) GPU assert(node_id < size()); if (use_subgraph_) { - return partitioned_graph_->getData(ConvertToLID(node_id)); + return mark_sampled_nodes_[ConvertToLID(node_id)]; } else { - return partitioned_graph_->getData(node_id); + return mark_sampled_nodes_[node_id]; } } @@ -850,6 +850,8 @@ class GNNGraph { DGAccumulator local_false_positive_; DGAccumulator local_false_negative_; + std::vector mark_sampled_nodes_; + bool use_timer_{true}; }; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8fc68ea193..b0ed03d34c 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -90,6 +90,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, LoadPartition(input_directory_, dataset_name, partition_scheme, useShad); // reverse edges partitioned_graph_->ConstructIncomingEdges(); + // mark a node if it is sampled + mark_sampled_nodes_.resize(partitioned_graph_->size()); galois::gInfo(host_prefix_, "Number of local proxies is ", partitioned_graph_->size()); From 01cdb4ccfecebfa8ec32aa23663662b2ec9481f4 Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Wed, 30 Aug 2023 21:51:54 -0500 Subject: [PATCH 604/660] Templatizing graphs and layers, feature construction and training/testing/validation vertices marking This commit contains three updates. First, it templatizes graphs and layers to support various node and edge data types, other than char and void. Second, this commit performs histogram-based feature construction like AGILE does. 
Third, this commit selects training/testing/validation vertices like AGILE does. This commit passed a correctness check on data10.csv, 4hosts, and CVC partitioning policy. --- .../include/galois/graphs/CuSPPartitioner.h | 12 +- .../include/galois/graphs/DistributedGraph.h | 28 +- libcusp/include/galois/graphs/NewGeneric.h | 186 +- libcusp/test/shad-dist-graph.cpp | 22 +- libgalois/include/shad/ShadGraphConverter.h | 54 +- .../include/galois/graphs/GluonSubstrate.h | 2 +- .../include/galois/runtime/SyncStructures.h | 8 +- libgnn/CMakeLists.txt | 11 +- libgnn/include/galois/GraphNeuralNetwork.h | 848 +++++++- .../galois/graphs/DegreeSyncStructures.h | 28 +- libgnn/include/galois/graphs/GNNGraph.h | 1882 ++++++++++++++++- libgnn/include/galois/graphs/GNNSubgraph.h | 456 +++- .../graphs/GraphAggregationSyncStructures.h | 161 +- libgnn/include/galois/layers/DenseLayer.h | 154 +- libgnn/include/galois/layers/GNNLayer.h | 563 ++++- .../galois/layers/GraphConvolutionalLayer.h | 485 ++++- libgnn/include/galois/layers/L2NormLayer.h | 133 +- libgnn/include/galois/layers/SAGELayer.h | 946 ++++++++- libgnn/include/galois/layers/SigmoidLayer.h | 123 +- libgnn/include/galois/layers/SoftmaxLayer.h | 156 +- libgnn/src/DistributedMinibatchTracker.cpp | 4 +- libgnn/src/GNNMath.cpp | 4 +- libgnn/src/GraphNeuralNetwork.cpp | 818 ------- libgnn/src/graphs/GNNGraph.cpp | 1539 +------------- libgnn/src/graphs/GNNSubgraph.cpp | 440 ---- libgnn/src/layers/DenseLayer.cpp | 144 -- libgnn/src/layers/GNNLayer.cpp | 557 ----- libgnn/src/layers/GraphConvolutionalLayer.cpp | 459 ---- libgnn/src/layers/L2NormLayer.cpp | 124 -- libgnn/src/layers/SAGELayer.cpp | 879 -------- libgnn/src/layers/SigmoidLayer.cpp | 113 - libgnn/src/layers/SoftmaxLayer.cpp | 139 -- libgnn/test/CMakeLists.txt | 2 + libgnn/test/accuracy-test.cpp | 6 +- libgnn/test/aggregate-sync-test.cpp | 20 +- libgnn/test/back-conv-test.cpp | 8 +- libgnn/test/convlayer-test.cpp | 16 +- libgnn/test/epoch-test.cpp | 6 +- libgnn/test/f1-test.cpp | 4 +- libgnn/test/gnnconstruct-test.cpp | 8 +- libgnn/test/gnnfb-test.cpp | 12 +- libgnn/test/gnngraph-test.cpp | 8 +- libgnn/test/gpu-adam-test.cpp | 4 +- libgnn/test/gpu-aggregate-sync-test.cpp | 12 +- libgnn/test/gpu-back-conv-test.cpp | 8 +- libgnn/test/gpu-convlayer-test.cpp | 16 +- libgnn/test/gpu-epoch-test.cpp | 6 +- libgnn/test/gpu-sage-layer-test.cpp | 12 +- libgnn/test/gpu-softmaxlayer-test.cpp | 6 +- libgnn/test/l2norm-layer-test.cpp | 6 +- libgnn/test/multilabel-epoch-test.cpp | 6 +- libgnn/test/multilabel-read.cpp | 4 +- libgnn/test/sage-layer-test.cpp | 12 +- libgnn/test/sample-bit-test.cpp | 4 +- libgnn/test/sample-test.cpp | 16 +- libgnn/test/sigmoidlayer-test.cpp | 6 +- libgnn/test/softmaxlayer-test.cpp | 6 +- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 5 +- lonestar/gnn/include/DistributedGraphLoader.h | 12 +- lonestar/gnn/src/DistributedGraphLoader.cpp | 6 +- .../libdistbench/include/DistBench/Input.h | 64 +- .../libdistbench/include/DistBench/Output.h | 1 + lonestar/libdistbench/src/Input.cpp | 6 +- lonestar/libgnnbench/include/GNNBench/Input.h | 161 +- lonestar/libgnnbench/src/Input.cpp | 144 +- lonestar/libgnnbench/src/Start.cpp | 27 + .../shad-gnn/check_feature_construction.py | 51 + 67 files changed, 6328 insertions(+), 5841 deletions(-) delete mode 100644 libgnn/src/GraphNeuralNetwork.cpp delete mode 100644 libgnn/src/layers/SoftmaxLayer.cpp create mode 100644 scripts/shad-gnn/check_feature_construction.py diff --git a/libcusp/include/galois/graphs/CuSPPartitioner.h 
b/libcusp/include/galois/graphs/CuSPPartitioner.h index 6b7fef6dab..5541be426d 100644 --- a/libcusp/include/galois/graphs/CuSPPartitioner.h +++ b/libcusp/include/galois/graphs/CuSPPartitioner.h @@ -50,7 +50,7 @@ using DistGraphPtr = * to the partitioner * @param outputType Specifies the output format (CSR or CSC) that each * partition will be created in - * @param useShad "true" if the passed graph file format is a SHAD WMD graph + * @param useWMD "true" if the passed graph file format is a WMD graph * @param symmetricGraph This should be "true" if the passed in graphFile * is a symmetric graph * @param transposeGraphFile Transpose graph of graphFile in Galois binary @@ -84,8 +84,8 @@ template DistGraphPtr cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, - CUSP_GRAPH_TYPE outputType, bool useShad = false, - bool symmetricGraph = false, + CUSP_GRAPH_TYPE outputType, bool useWMD = false, + bool symmetricGraph = false, std::string transposeGraphFile = "", std::string masterBlockFile = "", bool cuspAsync = true, uint32_t cuspStateRounds = 100, @@ -128,13 +128,13 @@ cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, } return std::make_unique( - inputToUse, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, useTranspose, - readPolicy, nodeWeight, edgeWeight, masterBlockFile); + inputToUse, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, + useTranspose, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } else { // symmetric graph path: assume the passed in graphFile is a symmetric // graph; output is also symmetric return std::make_unique( - graphFile, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, false, + graphFile, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, false, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 415afba33d..540b25e120 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -436,7 +436,7 @@ class DistGraph { // TODO(hc): auto r = galois::graphs::divideNodesBinarySearch( numGlobalNodes, numGlobalEdges, 0, edgeWeight, (id + d * numHosts), - numHosts * DecomposeFactor, outIndices, scalefactor); + numHosts * DecomposeFactor, outIndices, scalefactor); gid2host[id + d * numHosts].first = *(r.first.first); gid2host[id + d * numHosts].second = *(r.first.second); } @@ -504,9 +504,9 @@ class DistGraph { * @todo make this function work with decompose factor */ void computeMastersBalancedNodesAndEdges( - uint64_t numGlobalNodes, uint64_t numGlobalEdges, - uint64_t* outIndices, const std::vector& scalefactor, - uint32_t nodeWeight, uint32_t edgeWeight, unsigned) { + uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint64_t* outIndices, + const std::vector& scalefactor, uint32_t nodeWeight, + uint32_t edgeWeight, unsigned) { if (nodeWeight == 0) { nodeWeight = numGlobalEdges / numGlobalNodes; // average degree } @@ -517,8 +517,8 @@ class DistGraph { auto& net = galois::runtime::getSystemNetworkInterface(); gid2host.resize(numHosts); auto r = galois::graphs::divideNodesBinarySearch( - numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, - id, numHosts, outIndices, scalefactor); + numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, id, numHosts, + outIndices, scalefactor); gid2host[id].first = *r.first.first; gid2host[id].second = *r.first.second; for (unsigned h = 0; h < numHosts; ++h) { @@ -543,7 +543,6 @@ class DistGraph { 
increment_evilPhase(); } - protected: /** * Wrapper call that will call into more specific compute masters @@ -628,19 +627,17 @@ class DistGraph { // compute masters for all nodes switch (masters_distribution) { case BALANCED_MASTERS: - computeMastersBlockedNodes( - numGlobalNodes, scalefactor, DecomposeFactor); + computeMastersBlockedNodes(numGlobalNodes, scalefactor, DecomposeFactor); break; case BALANCED_MASTERS_AND_EDGES: - computeMastersBalancedNodesAndEdges( - numGlobalNodes, numGlobalEdges, outIndices, - scalefactor, nodeWeight, edgeWeight, DecomposeFactor); + computeMastersBalancedNodesAndEdges(numGlobalNodes, numGlobalEdges, + outIndices, scalefactor, nodeWeight, + edgeWeight, DecomposeFactor); break; case BALANCED_EDGES_OF_MASTERS: default: - computeMastersBalancedEdges( - numGlobalNodes, numGlobalEdges, outIndices, - scalefactor, edgeWeight, DecomposeFactor); + computeMastersBalancedEdges(numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, edgeWeight, DecomposeFactor); break; } @@ -658,7 +655,6 @@ class DistGraph { return numNodes_to_divide; } - //! reader assignment from a file //! corresponds to master assignment if using an edge cut void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) { diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 49c96a965c..9fa37159f1 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -223,8 +223,8 @@ class NewDistGraphGeneric : public DistGraph { */ NewDistGraphGeneric( const std::string& filename, unsigned host, unsigned _numHosts, - bool useShad = false, bool cuspAsync = true, uint32_t stateRounds = 100, - bool transpose = false, + bool useWMD = false, bool cuspAsync = true, uint32_t stateRounds = 100, + bool transpose = false, galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, std::string masterBlockFile = "", bool readFromFile = false, @@ -246,24 +246,36 @@ class NewDistGraphGeneric : public DistGraph { galois::graphs::OfflineGraph* offlineGraph{nullptr}; - shad::ShadGraphConverter shadConverter; + std::string host_prefix = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); + + shad::ShadGraphConverter shadConverter; galois::graphs::BufferedGraph bufGraph; bufGraph.resetReadCounters(); std::vector dummy; // not actually getting masters, but getting assigned readers for nodes if (masterBlockFile == "") { - if (useShad) { - std::cout << "Construct a distributed graph from SHAD WMD format.\n"; + if (useWMD) { uint64_t numGlobalNodes{0}, numGlobalEdges{0}; + galois::gInfo(host_prefix, "Starts reading SHAD graph file"); // Read and load the whole SHAD WMD dataset to memory. // TODO(hc): Note that this reads the entire graph. // We will improve this to read partial graphs // on each host later. For now, the main focus is // to enable WMD dataset for the workflows. 
shadConverter.readSHADFile(filename, &numGlobalNodes, &numGlobalEdges); + galois::gInfo(host_prefix, "Completes reading SHAD graph file"); base_DistGraph::numGlobalNodes = numGlobalNodes; base_DistGraph::numGlobalEdges = numGlobalEdges; + + galois::gInfo(host_prefix, + "Read graph # nodes:", std::to_string(numGlobalNodes), + " # edges:", std::to_string(numGlobalEdges)); + galois::gInfo(host_prefix, "Starts node array construction from SHAD" + " graph"); // Construct node data/outgoing index range arrays // for a GLOBAL array, not a local array. // Later, parts for the local graph partition will be @@ -281,24 +293,23 @@ class NewDistGraphGeneric : public DistGraph { // local nodes. // TODO(hc): UT will improve and redesign this part to // get scalability. - shadConverter.constructNodeArrays( - 0, numGlobalNodes, numGlobalNodes); - + shadConverter.constructNodeArrays(0, numGlobalNodes, numGlobalNodes); + galois::gInfo(host_prefix, "Completes node array construction from SHAD" + " graph"); // Compute master proxies by using the number of global nodes // and edges. base_DistGraph::computeMasters( - md, base_DistGraph::numGlobalNodes, - base_DistGraph::numGlobalEdges, - shadConverter.getOutIndexBuffer(), dummy, nodeWeight, - edgeWeight); + md, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + shadConverter.getOutIndexBuffer(), dummy, nodeWeight, edgeWeight); } else { offlineGraph = new galois::graphs::OfflineGraph(filename); base_DistGraph::numGlobalNodes = offlineGraph->size(); base_DistGraph::numGlobalEdges = offlineGraph->sizeEdges(); - base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, edgeWeight); + base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, + edgeWeight); } } else { - if (useShad) { + if (useWMD) { GALOIS_DIE("SHAD graph format does not support master block file"); } galois::gInfo("Getting reader assignment from file"); @@ -317,11 +328,13 @@ class NewDistGraphGeneric : public DistGraph { if (!trainPoints.empty()) { std::vector testDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, *offlineGraph, trainPoints[0], trainPoints[1]); + base_DistGraph::numHosts, *offlineGraph, trainPoints[0], + trainPoints[1]); std::vector restDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, *offlineGraph, trainPoints[1], offlineGraph->size()); + base_DistGraph::numHosts, *offlineGraph, trainPoints[1], + offlineGraph->size()); // create global distribution of edges std::vector mappings(offlineGraph->size()); @@ -371,9 +384,9 @@ class NewDistGraphGeneric : public DistGraph { graphReadTimer.start(); uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; - uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; + uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; - if (!useShad) { + if (!useWMD) { // If the input graph is not SHAD WMD format, // construct a buffered graph from the file directly, as ordinary. typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = @@ -384,40 +397,8 @@ class NewDistGraphGeneric : public DistGraph { *edgeEnd, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges); } else { - // Now construct arrays for in-memory CSR. - // In case of the node out-going edge range array and - // the node data array, it will extract parts corresponding to - // local graph paritition from the arrays holding the global - // array information. 
- // Edge destination and data arrays are constructed based on - // unrefined maps constructed from SHAD graph reading. - // NOTE that those arrays all store GLOBAL node ids. - // For example, edge destination array's size is equal - // to the number of local edges, but its destination ID is - // global node IDs, not local node IDs. - uint32_t numLocalNodes = nodeEnd - nodeBegin; - // So, this holds outgoing edge array of a whole (global) graph. - uint64_t *outIndexBuffer = shadConverter.getOutIndexBuffer(); - // Global edge id range assigned to the current host. - uint64_t edgeBegin = - (nodeBegin == 0)? 0 : outIndexBuffer[nodeBegin - 1]; - // This is the last local node's edge range end. - // So, [edgeBegin, edgeEnd) is for this current host. - uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; - // Extract node out-going range and data arrays of local nodes. - // From now on, those arrays store local node information - // as a dense memory representation. - shadConverter.extractLocalOutIndexArray( - nodeBegin, nodeEnd); - - uint64_t numLocalEdges = edgeEnd - edgeBegin; - shadConverter.constructEdgeArrays( - nodeBegin, edgeBegin, numLocalNodes, numLocalEdges); - // Construct a buffered graph that is used by CuSP to partition - // a graph. - shadConverter.constructBufferedGraph( - base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, - nodeBegin, nodeEnd, edgeBegin, edgeEnd, &bufGraph); + constructCSRFromSHADGraph( + &bufGraph, &shadConverter, nodeBegin, nodeEnd, host_prefix); } graphReadTimer.stop(); @@ -547,9 +528,9 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.stop(); galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); - if (useShad) { + if (useWMD) { // Different from the gr format file that has been used by Galois - // and does not contain node data in the file, + // and does not contain node data in the file, // a SHAD graph file has a single type for each node, and it // is considered as node data. // This function constructs and sets node data (type). @@ -604,11 +585,79 @@ class NewDistGraphGeneric : public DistGraph { return toReturn; } + /// Construct arrays for in-memory CSR. + /// In case of the node out-going edge range array and + /// the node data array, it will extract parts corresponding to + /// local graph paritition from the arrays holding the global + /// array information. + /// Edge destination and data arrays are constructed based on + /// unordered maps constructed from SHAD graph reading. + /// NOTE that those arrays for CSR all store GLOBAL node ids. + /// For example, edge destination array's size is equal + /// to the number of local edges, but its destination ID is + /// global node IDs, not local node IDs. + /// + /// @tparam T Graph node data type + /// + /// @param bufGraph Buffered graph to construct + /// @param shadConverter Shad graph ingestor which ingested + /// a SHAD graph in memory to an unordered node/edge map + /// @param nodeBegin Global id of the first local node range + /// @param nodeEnd Global id of the last local node range + /// @param host_prefix Log prefix string for this host + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void constructCSRFromSHADGraph( + galois::graphs::BufferedGraph* bufGraph, + shad::ShadGraphConverter* shadConverter, + uint64_t nodeBegin, uint64_t nodeEnd, std::string host_prefix) { + uint32_t numLocalNodes = nodeEnd - nodeBegin; + // So, this holds outgoing edge array of a whole (global) graph. 
+ uint64_t* outIndexBuffer = shadConverter->getOutIndexBuffer(); + // Global edge id range assigned to the current host. + uint64_t edgeBegin = (nodeBegin == 0) ? 0 : outIndexBuffer[nodeBegin - 1]; + // This is the last local node's edge range end. + // So, [edgeBegin, edgeEnd) is for this current host. + uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; + galois::gInfo(host_prefix, "Starts local out index array construction"); + // Extract node out-going range and data arrays of local nodes. + // From now on, those arrays store local node information + // as a dense memory representation. + shadConverter->extractLocalOutIndexArray(nodeBegin, nodeEnd); + galois::gInfo(host_prefix, + "Completes local out index array construction"); + + galois::gInfo(host_prefix, "Starts edge destination/data " + "array construction"); + uint64_t numLocalEdges = edgeEnd - edgeBegin; + shadConverter->constructEdgeArrays(nodeBegin, edgeBegin, numLocalNodes, + numLocalEdges); + + galois::gInfo(host_prefix, "Completes edge destination/data " + "array construction"); + // Construct a buffered graph that is used by CuSP to partition + // a graph. + shadConverter->constructBufferedGraph( + base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + nodeBegin, nodeEnd, edgeBegin, edgeEnd, bufGraph); + galois::gInfo(host_prefix, "Completes buffered graph construction from" + " SHAD graph"); + } + + // Disable this method for non-SHAD graph construction. + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void constructCSRFromSHADGraph( + galois::graphs::BufferedGraph*, + shad::ShadGraphConverter*, uint64_t, uint64_t, std::string) {} + /** * @brief Assign a SHAD node type to a node data. * * @detail Different from the gr format file that has been used by Galois - * and does not contain node data in the file, + * and does not contain node data in the file, * a SHAD graph file has a single type for each node, and it * considered as node data. This function constructs and sets node * data based on that. @@ -620,26 +669,23 @@ class NewDistGraphGeneric : public DistGraph { * @param shadConverter SHAD graph converter holding node data from a * SHAD file. 
*/ - template >* = nullptr> - void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { galois::gPrint("[", base_DistGraph::id, "] Graph node data is assigned."); - uint64_t* nodeDataBuffer = shadConverter->getNodeDataBuffer(); + shad::ShadNodeTy* nodeDataBuffer = shadConverter->getNodeDataBuffer(); galois::do_all(galois::iterate(base_DistGraph::allNodesRange()), - [&](uint32_t lid) { - uint64_t gid = this->getGID(lid); - this->getData(lid) = nodeDataBuffer[gid]; - std::cout << "lid :" << lid << " is set to " << - this->getData(lid) << "\n"; - }); + [&](uint32_t lid) { + uint64_t gid = this->getGID(lid); + this->getData(lid) = nodeDataBuffer[gid]; + }); } template >* = nullptr> - void assignNodeDataFromSHADProp( - [[maybe_unused]] shad::ShadGraphConverter* shadConverter) {} + typename std::enable_if_t>* = + nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter*) {} /** * For each other host, determine which nodes that this host needs to get diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp index fe71231295..dedc3c34cb 100644 --- a/libcusp/test/shad-dist-graph.cpp +++ b/libcusp/test/shad-dist-graph.cpp @@ -29,15 +29,20 @@ int main() { //M = 1; galois::setActiveThreads(M); - shad::ShadGraphConverter shadConverter; + shad::ShadGraphConverter shadConverter; size_t numNodes{0}, numEdges{0}; - std::string filename = "/home/hochan/data.csv"; + // TODO(hc): This path should be properly set based on user's environment. + // Later, this test dataset will be included in the Galois repository, and + // will use a relative path. + std::string filename = "/home/hochan/data.01.csv"; shadConverter.readSHADFile(filename, &numNodes, &numEdges); - std::unique_ptr> - graph = galois::cuspPartitionGraph( + std::unique_ptr> + graph = galois::cuspPartitionGraph( filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); + std::cout << "Test starts...\n"; + galois::DGAccumulator sumGlobalNodes; galois::DGAccumulator sumGlobalEdges; @@ -55,13 +60,16 @@ int main() { assert(reducedSumGlobalEdges == numEdges); assert(reducedSumGlobalEdges == graph->globalSizeEdges()); + std::cout << "Num. 
nodes/edges tests has been passed\n"; + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; { std::ofstream fp(std::to_string(id) + ".master"); for (uint32_t src = 0; src < graph->numMasters(); ++src) { uint64_t srcglobal = graph->getGID(src); - fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + fp << "node " << srcglobal << ", type: " << graph->getData(src).type << + ", key: " << graph->getData(src).key << "\n"; for (auto e : graph->edges(src)) { uint32_t dstlocal = graph->getEdgeDst(e); uint64_t dstglobal = graph->getGID(dstlocal); @@ -78,11 +86,12 @@ int main() { std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".graph"); for (uint32_t i = 0; i < graph->size(); ++i) { fp << i << ", " << graph->getGID(i) << ", " << - graph->getData(i) << "\n"; + graph->getData(i).type << ", " << graph->getData(i).key << "\n"; } fp.close(); } } +#if 0 { for (uint32_t host = 0; host < numHosts; ++host) { if (host == id) { @@ -113,6 +122,7 @@ int main() { fp.close(); } } +#endif return 0; } diff --git a/libgalois/include/shad/ShadGraphConverter.h b/libgalois/include/shad/ShadGraphConverter.h index 5162fc8dfb..4b1c0351db 100644 --- a/libgalois/include/shad/ShadGraphConverter.h +++ b/libgalois/include/shad/ShadGraphConverter.h @@ -12,11 +12,16 @@ namespace shad { +struct ShadNodeTy { + int type; + uint64_t key; +}; +using ShadEdgeTy = uint64_t; + /** * TODO(hc): This is a shared-memory version. * Later, a distributed-memory version in libgluon will reuse this code. */ -template class ShadGraphConverter { public: @@ -39,13 +44,14 @@ class ShadGraphConverter { for (size_t i = 0; i < this->verticeIdKeyMapping.size(); ++i) { uint64_t key = this->verticeIdKeyMapping[i]; Vertex v = this->vertices[key]; - fp << "node " << i << ", type: " << to_underlying(v.type) << "\n"; + fp << "node " << i << ", type: " << to_underlying(v.type) << ", key: " << + key << "\n"; auto edgeRange = this->edges.equal_range(key); for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { Edge& edge = ei->second; Vertex dst = this->vertices[edge.dst]; fp << "\t edge dst " << dst.id << ", type: " << - to_underlying(edge.type) << "\n"; + to_underlying(edge.type) << ", key: " << dst.shadKey << "\n"; } } fp.close(); @@ -110,7 +116,7 @@ class ShadGraphConverter { uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint32_t nodeBegin, uint32_t nodeEnd, uint64_t edgeBegin, uint64_t edgeEnd, - [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { + [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { // TODO(hc): Each of these functions first construct graphs in the SHAD // format as this file is written in not binary, but string, and also // nodes or edges are not sorted. So, until we preprocess the input graph @@ -250,7 +256,7 @@ class ShadGraphConverter { * @brief Return node data array. * Note that this can be either of global graph or local graph. */ - uint64_t* getNodeDataBuffer() { + ShadNodeTy* getNodeDataBuffer() { return nodeDataBuffer; } @@ -280,7 +286,7 @@ class ShadGraphConverter { uint32_t nodeBegin, uint32_t nodeEnd, uint32_t numLocalNodes) { // 1) Construct an edge index array (size == number of nodes). this->outIndexBuffer = new uint64_t[numLocalNodes]; - this->nodeDataBuffer = new uint64_t[numLocalNodes]; + this->nodeDataBuffer = new ShadNodeTy[numLocalNodes]; // TODO(hc): for now, only consider a single host, but need to add offset later. 
galois::do_all(galois::iterate(this->vertices), @@ -292,8 +298,13 @@ class ShadGraphConverter { vertex.getNumEdges(); // Fill vertex data too; This assumes that a SHAD graph // has a type, which is considered as a vertex data. - this->nodeDataBuffer[vertexId - nodeBegin] = + this->nodeDataBuffer[vertexId - nodeBegin].type = this->to_underlying(vertex.type); + this->nodeDataBuffer[vertexId - nodeBegin].key = + vertex.shadKey; + //std::cout << vertexId - nodeBegin << " is set to " + //<< this->nodeDataBuffer[vertexId - nodeBegin].type << " and " << + //this->nodeDataBuffer[vertexId - nodeBegin].key << "\n"; } }); // 2) Perform parallel prefix sum to finalize outgoing edge index @@ -320,14 +331,14 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> void constructEdgeArrays( uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, uint64_t numLocalEdges) { this->edgeDestBuffer = new uint32_t[numLocalEdges]; - this->edgeDataBuffer = new EdgeDataTy[numLocalEdges]; + this->edgeDataBuffer = new ShadEdgeTy[numLocalEdges]; std::vector edgeIndexPointers(numLocalNodes, 0); galois::on_each([&](uint32_t tid, uint32_t numThreads) { // 1) Find disjointed node range for each thread. @@ -375,7 +386,7 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> void constructEdgeArrays( @@ -437,7 +448,7 @@ class ShadGraphConverter { * @return True if passed information matches to the one in * a temporary vertex map */ - bool checkNode(uint64_t id, uint64_t type) { + bool checkNode(uint64_t id, int type) { uint64_t key = this->verticeIdKeyMapping[id]; Vertex& vertex = this->vertices[key]; return (this->to_underlying(vertex.type) == type); @@ -457,11 +468,8 @@ class ShadGraphConverter { * a temporary edge map */ bool checkEdge(uint64_t snid, uint64_t dnid, - uint64_t eid, uint64_t type) { + uint64_t /*eid*/, int type) { uint64_t skey = this->verticeIdKeyMapping[snid]; - uint64_t dkey = this->verticeIdKeyMapping[dnid]; - - Vertex& vertex = this->vertices[skey]; auto edgeRange = this->edges.equal_range(skey); uint64_t eidx{0}; Edge edge; @@ -527,7 +535,7 @@ class ShadGraphConverter { return tokens; } - void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t numEdges) { + void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t /*numEdges*/) { //galois::on_each([this, numNodes, numEdges]( galois::on_each([&]( uint32_t tid, uint32_t numThreads) { @@ -626,16 +634,16 @@ class ShadGraphConverter { void VerifyCSRConstruction( [[maybe_unused]] uint64_t* outIndexBuffer, - [[maybe_unused]] uint64_t* nodeDataBuffer, + [[maybe_unused]] ShadNodeTy* nodeDataBuffer, [[maybe_unused]] uint32_t* edgeDestBuffer, [[maybe_unused]] void* edgeDataBuffer) {} - template >* = nullptr> void VerifyCSRConstruction( - uint64_t* outIndexBuffer, [[maybe_unused]] uint64_t* nodeDataBuffer, - uint32_t* edgeDestBuffer, EdgeDataTy* edgeDataBuffer) { + uint64_t* outIndexBuffer, [[maybe_unused]] ShadNodeTy* nodeDataBuffer, + uint32_t* edgeDestBuffer, ShadEdgeTy* edgeDataBuffer) { // 1) Iterate edge index array. // 2) Compare each verteices' edge range with SHAD vertex for (size_t i = 0; i < this->vertices.size(); ++i) { @@ -645,7 +653,7 @@ class ShadGraphConverter { uint64_t edgeBegin = (i == 0)? 
0 : outIndexBuffer[i - 1]; uint64_t edgeEnd = outIndexBuffer[i]; assert(srcV.numEdges == edgeEnd - edgeBegin); - assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i])); + assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i].type)); assert(srcV.id == i); galois::do_all(galois::iterate(edgeBegin, edgeEnd), [&](size_t j) { @@ -701,10 +709,10 @@ class ShadGraphConverter { // Key is global node id and value is corresponding key of that node std::unordered_map verticeIdKeyMapping; // TODO(hc): Always assume uint64_t node data type - uint64_t* nodeDataBuffer; + ShadNodeTy* nodeDataBuffer; uint64_t* outIndexBuffer; uint32_t* edgeDestBuffer; - EdgeDataTy* edgeDataBuffer; + ShadEdgeTy* edgeDataBuffer; }; }; // shad namespace diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index dc834357b5..ec24bf2ce6 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -1443,7 +1443,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename FnTy::ValTy::value_type* location_to_write) { if (syncType == syncReduce) { FnTy::ExtractDirect(lid, location_to_write); - char dummy = 0; + typename FnTy::NodeTy dummy{}; FnTy::reset(lid, dummy); } else { FnTy::ExtractDirect(lid, location_to_write); diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index 75398c4f02..56cf8dd311 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1985,14 +1985,16 @@ class FieldFlags { #define GALOIS_SYNC_STRUCTURE_GNN_LAYER(fieldname, cuda_ctx_for_sync, \ gnn_matrix_to_sync_column_length_, \ layer_number_to_sync) \ + template \ struct GNNSumAggregate_##fieldname { \ + using NodeTy = NTy; \ using ValTy = GNNFloat; \ \ - static ValTy extract(uint32_t, char&) { return 0.f; } \ + static ValTy extract(uint32_t, NodeTy&) { return 0.f; } \ \ - static bool reduce(uint32_t, char&, ValTy) { return false; } \ + static bool reduce(uint32_t, NodeTy&, ValTy) { return false; } \ \ - static void reset(uint32_t, char&) {} \ + static void reset(uint32_t, NodeTy&) {} \ \ static void setVal(uint32_t, char&, ValTy) {} \ \ diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 98df493175..ca799c34b4 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -2,17 +2,8 @@ set(sources src/DistributedMinibatchTracker.cpp src/GNNMath.cpp src/GNNOptimizers.cpp - src/GraphNeuralNetwork.cpp src/MinibatchGenerator.cpp src/graphs/GNNGraph.cpp - src/graphs/GNNSubgraph.cpp - src/layers/DenseLayer.cpp - src/layers/GNNLayer.cpp - src/layers/GraphConvolutionalLayer.cpp - src/layers/L2NormLayer.cpp - src/layers/SAGELayer.cpp - src/layers/SigmoidLayer.cpp - src/layers/SoftmaxLayer.cpp ) ## TODO(hc): Note that these libraries should be hard-coded @@ -26,8 +17,8 @@ set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) -target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_link_libraries(galois_gnn ${INTEL_LIBS}) +target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_link_libraries(galois_gnn galois_shmem) target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC diff --git a/libgnn/include/galois/GraphNeuralNetwork.h 
b/libgnn/include/galois/GraphNeuralNetwork.h index 7aa859c84c..c63175f65e 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -9,6 +9,14 @@ #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" #include "galois/DistributedMinibatchTracker.h" +#include "galois/GNNMath.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/DenseLayer.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/L2NormLayer.h" +#include "galois/layers/SAGELayer.h" +#include "galois/layers/SigmoidLayer.h" +#include "galois/layers/SoftmaxLayer.h" #ifdef GALOIS_ENABLE_GPU #include "galois/GraphNeuralNetwork.cuh" @@ -139,19 +147,195 @@ class GraphNeuralNetworkConfig { //! Class representing the graph neural network: contains the graph to train as //! well as all the layers that comprise it +template class GraphNeuralNetwork { public: //! Construct the graph neural network given the graph to train on as well as //! a configuration object - GraphNeuralNetwork(std::unique_ptr graph, + GraphNeuralNetwork(std::unique_ptr> graph, std::unique_ptr optimizer, - GraphNeuralNetworkConfig&& config); + GraphNeuralNetworkConfig&& config) + : graph_(std::move(graph)), optimizer_(std::move(optimizer)), + config_(std::move(config)) { + if (config_.do_sampling_ && config_.use_train_subgraph_) { + GALOIS_LOG_FATAL("Do not set train subgraph and sampling at same time " + "(sampling uses training subgraph already)"); + } + // max number of rows that can be passed as inputs; allocate space for it as + // this will be the # of rows for each layer + size_t max_rows = graph_->size(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->ResizeGPULayerVector(config_.num_intermediate_layers()); + } +#endif + // used for chaining layers together; begins as nullptr + PointerWithSize prev_output_layer(nullptr, 0); + num_graph_user_layers_ = 0; + + // create the intermediate layers + for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { + GNNLayerType layer_type = config_.intermediate_layer_type(i); + size_t prev_layer_columns; + + if (i != 0) { + // grab previous layer's size + prev_layer_columns = config_.intermediate_layer_size(i - 1); + } else { + // first layer means the input columns are # features in graph + prev_layer_columns = graph_->node_feature_length(); + } + + // max dims + GNNLayerDimensions layer_dims = {.input_rows = max_rows, + .input_columns = prev_layer_columns, + .output_columns = + config_.intermediate_layer_size(i), + .output_rows = max_rows}; + + // test minibatch size: if it's not enabled, then currently the full + // graph is used (should really only subgraph the test nodes, though; + // that's a TODO) + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + galois::gInfo("Not allocating rows"); + // set to 0 here to make it allocate nothing + layer_dims.input_rows = 0; + layer_dims.output_rows = 0; + } + + switch (layer_type) { + case GNNLayerType::kGraphConvolutional: + gnn_layers_.push_back( + std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); + break; + case GNNLayerType::kSAGE: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + 
gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) sage layer gpu +#endif + break; + case GNNLayerType::kL2Norm: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + break; + case GNNLayerType::kDense: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } + + // update output layer for next layer + prev_output_layer = gnn_layers_.back()->GetForwardOutput(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->InitLayerVectorMetaObjects( + i, galois::runtime::getSystemNetworkInterface().Num, + layer_dims.input_columns, layer_dims.output_columns); + } +#endif + } + + // loop backward and find last GCN/SAGE (main) layer to disable activation + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + galois::gDebug("Disabling activation on layer ", + (*back_iter)->layer_number(), "\n"); + (*back_iter)->DisableActivation(); + break; + } + } + + if (config_.do_sampling() || config_.use_train_subgraph_ || + config.train_minibatch_size() || config.test_minibatch_size()) { + // output layer not included; it will never involve sampling + graph_->InitializeSamplingData(num_graph_user_layers_, + config_.use_train_subgraph_); + } + + num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; + if (config_.train_minibatch_size()) { + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + // size_t local_num = + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_ = + // std::make_unique( + // galois::runtime::getSystemNetworkInterface().ID, num_hosts_, + // local_num, config_.train_minibatch_size()); + //} + } + + if (config_.test_minibatch_size()) { + graph_->SetupTestBatcher(config_.test_minibatch_size()); + } + + // create the output layer + GNNLayerDimensions output_dims = { + .input_rows = max_rows, + // get last intermediate layer column size + .input_columns = config_.intermediate_layer_size( + config_.num_intermediate_layers() - 1), + .output_columns = config_.output_layer_size(), + .output_rows = max_rows}; + + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + output_dims.input_rows = 0; + output_dims.output_rows = 0; + } + + switch (config_.output_layer_type()) { + case (GNNOutputLayerType::kSoftmax): + gnn_layers_.push_back(std::move(std::make_unique>( + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); + break; + case (GNNOutputLayerType::kSigmoid): + gnn_layers_.push_back(std::move(std::make_unique>( + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } + + // sanity checking multi-class + output layer + if (!graph_->is_single_class_label() && + (config_.output_layer_type() != GNNOutputLayerType::kSigmoid)) { + GALOIS_LOG_WARN( + "Using a non-sigmoid output layer with a multi-class label!"); + // if debug mode just kill program + assert(false); + } + + // flip sampling on layers + if (config_.use_train_subgraph_ || 
config_.do_sampling() || + config_.train_minibatch_size()) { + for (std::unique_ptr>& ptr : gnn_layers_) { + ptr->EnableSampling(); + } + } + } //! Number of intermediate layers (DOES NOT INCLUDE OUTPUT LAYER) size_t num_intermediate_layers() { return gnn_layers_.size() - 1; } //! Returns pointer to intermediate layer i - galois::GNNLayer* GetIntermediateLayer(size_t i) { + galois::GNNLayer* GetIntermediateLayer(size_t i) { if (i < gnn_layers_.size() - 1) { return gnn_layers_[i].get(); } else { @@ -162,43 +346,669 @@ class GraphNeuralNetwork { //! Set the phases of all layers at once as well as this network void SetLayerPhases(galois::GNNPhase phase) { phase_ = phase; - for (std::unique_ptr& ptr : gnn_layers_) { + for (std::unique_ptr>& ptr : gnn_layers_) { ptr->SetLayerPhase(phase); } } //! Set weights on all layers to 1; should be used for debugging only void SetAllLayerWeightsTo1() { - for (std::unique_ptr& ptr : gnn_layers_) { + for (std::unique_ptr>& ptr : gnn_layers_) { ptr->InitAllWeightsTo1(); } } //! Returns the output layer - galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + galois::GNNLayer* GetOutputLayer() { + return gnn_layers_.back().get(); + } + + float MinibatchedTesting() { + galois::gDebug("Minibatched Testing"); + graph_->DisableSubgraph(); + graph_->ResetTestMinibatcher(); + SetLayerPhases(galois::GNNPhase::kBatch); + + bool choose_all_status = graph_->SubgraphChooseAllStatus(); + + uint32_t correct = 0; + uint32_t total = 0; + while (true) { + work_left_.reset(); + // size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + graph_->PrepareNextTestMinibatch(); + // last layer input size/output rows becomes seed node size + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); + size_t num_sampled_layers = 0; + + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + // size_t current_sample_size; + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), false, + num_sampled_layers + 1); + // resize this layer, change seed node count + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, seed_node_count); + // seed_node_count = current_sample_size; + + num_sampled_layers++; + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + } + } + + // resize layer matrices + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + graph_->EnableSubgraphChooseAll(); + CorrectBackwardLinks(); + + const PointerWithSize batch_pred = DoInference(); + std::pair correct_total = + graph_->GetBatchAccuracy(batch_pred); + + correct += correct_total.first; + total += correct_total.second; + + work_left_ += graph_->MoreTestMinibatches(); + char global_work_left = work_left_.reduce(); + if (!global_work_left) { + break; + } + } + + galois::gInfo("Minibatching Correct / Total ", correct, " ", total); + + if (choose_all_status) { + graph_->EnableSubgraphChooseAll(); + } else { + graph_->DisableSubgraphChooseAll(); + } - float MinibatchedTesting(); + return (1.0 * correct) / (1.0 * total); + } //! Do training for a specified # of epochs and return test accuracy at the //! 
end of it - float Train(size_t num_epochs); + float Train(size_t num_epochs) { + EnableTimers(); + const size_t this_host = graph_->host_id(); + float train_accuracy{0.f}; + std::vector subgraph_layer_sizes; + // this subgraph only needs to be created once + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { + galois::StatTimer total_subgraph_construction_timer( + "TotalSubGraphConstruction", kRegionName); + galois::StatTimer setup_neighborhood_sample_timer( + "SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleAllEdges", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", + kRegionName); + total_subgraph_construction_timer.start(); + + setup_neighborhood_sample_timer.start(); + // Setup the subgraph to only be the training graph + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); + + subgraph_layer_sizes.emplace_back(local_seed_node_count); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + edge_sampling_timer.start(); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for train subgraph for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + // resizing + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); + local_seed_node_count = current_sample_size; + subgraph_layer_sizes.emplace_back(local_seed_node_count); + num_sampled_layers++; + } + } + edge_sampling_timer.stop(); + subgraph_construction_timer.start(); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); + CorrectBackwardLinks(); + total_subgraph_construction_timer.stop(); + } + + galois::StatTimer epoch_timer("TrainingTime", kRegionName); + galois::StatTimer validation_timer("ValidationTime", kRegionName); + galois::StatTimer epoch_test_timer("TestTime", kRegionName); + + for (size_t epoch = 0; epoch < num_epochs; epoch++) { + epoch_timer.start(); + // swap to train subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { + graph_->EnableSubgraph(); + // TODO(loc) this doesn't actually function as expected anymore + // with the numerous changes to the system; this commenting + // out is more of a hack for the train subgraph option (which + // probably shouldn't be used anyways) + + // size_t l_count = 0; + // gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + // for (auto back_iter = gnn_layers_.rbegin(); + // back_iter != gnn_layers_.rend(); back_iter++) { + // GNNLayerType layer_type = (*back_iter)->layer_type(); + // if (layer_type == GNNLayerType::kGraphConvolutional || + // layer_type == GNNLayerType::kSAGE) { + // (*back_iter) + // ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], + // subgraph_layer_sizes[l_count]); + // l_count++; + // } + //} + CorrectBackwardLinks(); + } + + // beginning of epoch sampling (no minibatches) + if (config_.do_sampling() && 
!config_.train_minibatch_size()) { + galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", + kRegionName); + galois::StatTimer setup_neighborhood_sample_timer( + "SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleEdges", kRegionName); + mb_timer.start(); + + setup_neighborhood_sample_timer.start(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + + edge_sampling_timer.start(); + // work backwards on GCN/SAGE layers + // loop backward and find last GCN/SAGE (main) layer to disable + // activation + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); + local_seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + edge_sampling_timer.stop(); + // resize layer matrices + subgraph_construction_timer.start(); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); + CorrectBackwardLinks(); + mb_timer.stop(); + } + + if (!config_.train_minibatch_size()) { + // no minibatching, full batch + const PointerWithSize predictions = DoInference(); + // have to get accuracy here because gradient prop destroys the + // predictions matrix + train_accuracy = GetGlobalAccuracy(predictions); + GradientPropagation(); + } else { + graph_->ResetTrainMinibatcher(); + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_->ResetEpoch(); + //} + + SetLayerPhases(galois::GNNPhase::kBatch); + + size_t batch_num = 0; + + // create mini batch graphs and loop until minibatches on all hosts done + while (true) { + galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); + galois::StatTimer sample_time("MinibatchSampling", kRegionName); + galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", + kRegionName); + mb_timer.start(); + + galois::Timer batch_timer; + batch_timer.start(); + work_left_.reset(); + galois::gInfo("Epoch ", epoch, " batch ", batch_num++); + // break when all hosts are done with minibatches + prep_timer.start(); + size_t seed_node_count; + // if (num_hosts_ > 1) { + // size_t num_for_next_batch = + // dist_minibatch_tracker_->GetNumberForNextMinibatch(); + // galois::gInfo(graph_->host_prefix(), "Sampling ", + // num_for_next_batch, + // " for this minibatch"); + // seed_node_count = + // graph_->PrepareNextTrainMinibatch(num_for_next_batch); + //} else { + //} + seed_node_count = graph_->PrepareNextTrainMinibatch(); + + galois::gDebug(graph_->host_prefix(), + "Number of local seed nodes is for batch is ", + seed_node_count); + 
prep_timer.stop(); + + // last layer input size/output rows becomes seed node size + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); + + sample_time.start(); + // +1 later in call because 0 is already taken + size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + size_t current_sample_size; + + if (config_.do_sampling()) { + current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); + } else { + current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + } + + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + + // resize this layer, change seed node count + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // seed_node_count); + seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + sample_time.stop(); + + // resize layer matrices + subgraph_construction_timer.start(); + CorrectRowCounts( + graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); + CorrectBackwardLinks(); + + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + + mb_timer.stop(); + + const PointerWithSize batch_pred = DoInference(); + train_accuracy = GetGlobalAccuracy(batch_pred); + GradientPropagation(); + + work_left_ += graph_->MoreTrainMinibatches(); + char global_work_left = work_left_.reduce(); + batch_timer.stop(); + epoch_timer.stop(); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Train accuracy/F1 micro is ", train_accuracy, + " time ", batch_timer.get(), "\n"); + + bool test_eval = + config_.minibatch_test_interval_ + ? 
(batch_num - 1) % config_.minibatch_test_interval_ == 0 + : false; + + if (test_eval) { + DisableTimers(); + float test_acc; + if (!config_.test_minibatch_size()) { + // TODO something about this path breaks accuracy + GALOIS_LOG_FATAL("this path breaks accuracy for the rest of the " + "run for some reason"); + bool f = graph_->SubgraphChooseAllStatus(); + graph_->DisableSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // TODO nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); + SetLayerPhases(galois::GNNPhase::kTest); + graph_->EnableSubgraphChooseAll(); + const PointerWithSize test_pred = DoInference(); + test_acc = GetGlobalAccuracy(test_pred); + graph_->SetSubgraphChooseAll(f); + } else { + test_acc = MinibatchedTesting(); + } + + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Test accuracy is ", test_acc, "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1) + "Accuracy"; + galois::runtime::reportStat_Single(kRegionName, test_name_acc, + test_acc); + } + + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + kRegionName, + "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1), + epoch_timer.get()); + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + EnableTimers(); + } + + epoch_timer.start(); + + if (!global_work_left) { + // if (num_hosts_ > 1) { + // GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); + //} + break; + } + } + } + epoch_timer.stop(); + + if (this_host == 0) { + const std::string t_name_acc = + "TrainEpoch" + std::to_string(epoch) + "Accuracy"; + galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", + train_accuracy, "\n"); + galois::runtime::reportStat_Single(kRegionName, t_name_acc, + train_accuracy); + } + + bool do_validate = config_.validation_interval_ + ? epoch % config_.validation_interval_ == 0 + : false; + bool do_test = + config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; + + bool subgraph_choose_all_status = graph_->SubgraphChooseAllStatus(); + + if (do_validate || do_test) { + DisableTimers(); + // disable subgraph + graph_->DisableSubgraph(); + graph_->EnableSubgraphChooseAll(); + } + + if (do_validate) { + // XXX induced subgraph here + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + + CorrectBackwardLinks(); + validation_timer.start(); + SetLayerPhases(galois::GNNPhase::kValidate); + const PointerWithSize val_pred = DoInference(); + validation_timer.stop(); + + float val_acc = GetGlobalAccuracy(val_pred); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, + "\n"); + const std::string v_name_acc = + "ValEpoch" + std::to_string(epoch) + "Accuracy"; + galois::runtime::reportStat_Single(kRegionName, v_name_acc, val_acc); + } + } + + if (do_test) { + epoch_test_timer.start(); + float test_acc; + + if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + epoch_test_timer.stop(); + test_acc = GetGlobalAccuracy(test_pred); + } else { + test_acc = MinibatchedTesting(); + epoch_test_timer.stop(); + } + + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, + "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Accuracy"; + galois::runtime::reportStat_Single(kRegionName, test_name_acc, + test_acc); + } + } + + if (do_validate || do_test) { + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch), + epoch_timer.get()); + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + graph_->SetSubgraphChooseAll(subgraph_choose_all_status); + + // TODO too much code dupe + // Resconstruct the train subgraph since it was replaced by test + // subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size() && + config_.test_minibatch_size() && do_test) { + // Setup the subgraph to only be the training graph + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), + "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + // gnn_layers_.back()->ResizeRows(local_seed_node_count); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + // resizing + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); + local_seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + CorrectRowCounts( + graph_->ConstructSampledSubgraph(num_sampled_layers)); + CorrectBackwardLinks(); + } + + EnableTimers(); + } + } + + uint64_t average_epoch_time = epoch_timer.get() / num_epochs; + galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", + average_epoch_time); 
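+  // Descriptive comment (editorial, summarizing the code below): training
+  // epochs are complete at this point, so any sampled subgraph is torn down
+  // and one final test pass runs (full graph, or minibatched testing) to
+  // compute and report FinalTestAccuracy.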
+ // DisableTimers(); + // disable subgraph + graph_->DisableSubgraph(); + graph_->EnableSubgraphChooseAll(); + + // check test accuracy + galois::StatTimer test_timer("FinalTestRun", kRegionName); + float global_accuracy; + + test_timer.start(); + + if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // TODO nuclear resize; this is **ridiculously** inefficient + // because full graph will be used even if not included in test + // k-hop neighborhood for eval + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize predictions = DoInference(); + global_accuracy = GetGlobalAccuracy(predictions); + } else { + global_accuracy = MinibatchedTesting(); + } + + test_timer.stop(); + + if (this_host == 0) { + galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); + galois::runtime::reportStat_Single(kRegionName, "FinalTestAccuracy", + global_accuracy); + } + + return global_accuracy; + } //! Propogates the graph's feature vectors through the network to get a new //! vector representation. //! Also known as the forward phase in most literature //! @returns Output layer's output - const PointerWithSize DoInference(); + const PointerWithSize DoInference() { + galois::StatTimer timer("DoInference", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + + // start with graph features and pass it through all layers of the network + galois::PointerWithSize layer_input = + graph_->GetLocalFeatures(); + + for (std::unique_ptr>& ptr : gnn_layers_) { + layer_input = ptr->ForwardPhase(layer_input); + } + + if (timers_on_) { + timer.stop(); + } + + return layer_input; + } //! Returns classification accuracy for single class label or micro F1 score //! for multi-class predictions; this calls into GNNGraph's accuracy call - float GetGlobalAccuracy(const PointerWithSize predictions); + float GetGlobalAccuracy(const PointerWithSize predictions) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (cpu_pred_.size() != predictions.size()) { + cpu_pred_.resize(predictions.size()); + } + + // TODO get rid of CPU copy here if possible + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred_, predictions); + return graph_->GetGlobalAccuracy(cpu_pred_, phase_, + config_.do_sampling()); + } else { +#endif + return graph_->GetGlobalAccuracy(predictions, phase_, + config_.do_sampling()); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + float GetGlobalAccuracy(const PointerWithSize predictions, bool sampling); //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! 
most literature - void GradientPropagation(); + void GradientPropagation() { + galois::StatTimer timer("GradientPropagation", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + + // from output layer get initial gradients + std::vector dummy; + std::unique_ptr>& output_layer = + gnn_layers_.back(); + galois::PointerWithSize current_gradients = + output_layer->BackwardPhase(dummy, nullptr); + // loops through intermediate layers in a backward fashion + // -1 to ignore output layer which was handled above + for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { + // note this assumes you have at least 2 layers (including output) + size_t layer_index = gnn_layers_.size() - 2 - i; + + // get the input to the layer before this one + galois::PointerWithSize prev_layer_input; + if (layer_index != 0) { + prev_layer_input = gnn_layers_[layer_index - 1]->GetForwardOutput(); + } else { + prev_layer_input = graph_->GetLocalFeatures(); + } + + // backward prop and get a new set of gradients + current_gradients = gnn_layers_[layer_index]->BackwardPhase( + prev_layer_input, ¤t_gradients); + // if not output do optimization/gradient descent + // at this point in the layer the gradients exist; use the gradients to + // update the weights of the layer + gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); + } + + if (timers_on_) { + timer.stop(); + } + } //! # nodes may change in distributed setting due to dead mirrors; //! given the # of nodes at each layer, fix the input/output rows @@ -227,7 +1037,17 @@ class GraphNeuralNetwork { } //! Call whenever resize occurs to correct reuse of pointers for layers - void CorrectBackwardLinks(); + void CorrectBackwardLinks() { + // layer chain pointer + PointerWithSize prev_output_layer(nullptr, 0); + for (size_t layer_num = 0; layer_num < gnn_layers_.size(); layer_num++) { + // first layer is nullptr so can be ignored + if (layer_num != 0) { + gnn_layers_[layer_num]->UpdateBackwardOutput(&prev_output_layer); + } + prev_output_layer = gnn_layers_[layer_num]->GetForwardOutput(); + } + } private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; @@ -251,13 +1071,13 @@ class GraphNeuralNetwork { } //! Underlying graph to train - std::unique_ptr graph_; + std::unique_ptr> graph_; //! Optimizer object for weight updates std::unique_ptr optimizer_; //! Configuration object used to construct this GNN GraphNeuralNetworkConfig config_; //! GNN layers including the output - std::vector> gnn_layers_; + std::vector>> gnn_layers_; //! Current phase of the GNN: train, validation, test GNNPhase phase_{GNNPhase::kTrain}; //! Number of layers that use the graph (e.g. SAGE, GCN) diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 659541570d..a104f18bff 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -10,17 +10,19 @@ extern uint32_t* gnn_degree_vec_2_; extern galois::DynamicBitSet bitset_sampled_degrees_; extern std::vector>* gnn_sampled_out_degrees_; +template struct InitialDegreeSync { - using ValTy = std::pair; + using NodeTy = NTy; + using ValTy = std::pair; //! return a vector of floats to sync - static ValTy extract(uint32_t lid, char&) { + static ValTy extract(uint32_t lid, NodeTy&) { return std::make_pair(gnn_degree_vec_1_[lid], gnn_degree_vec_2_[lid]); } //! reduction is addition in this case; add received vector to //! 
own vector - static bool reduce(uint32_t lid, char&, ValTy y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy y) { gnn_degree_vec_1_[lid] += y.first; gnn_degree_vec_2_[lid] += y.second; if (y.first || y.second) { @@ -31,13 +33,13 @@ struct InitialDegreeSync { } //! No-op: readAny = overwritten anyways - static void reset(uint32_t lid, char&) { + static void reset(uint32_t lid, NodeTy&) { gnn_degree_vec_1_[lid] = 0; gnn_degree_vec_2_[lid] = 0; } //! element wise set - static void setVal(uint32_t lid, char&, ValTy y) { + static void setVal(uint32_t lid, NodeTy&, ValTy y) { gnn_degree_vec_1_[lid] = y.first; gnn_degree_vec_2_[lid] = y.second; } @@ -58,12 +60,14 @@ struct InitialDegreeSync { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +template struct SubgraphDegreeSync { - using ValTy = galois::gstl::Vector; + using NodeTy = NTy; + using ValTy = galois::gstl::Vector; static size_t FeatVecSize() { return gnn_sampled_out_degrees_->size(); } - static ValTy extract(uint32_t lid, char&) { + static ValTy extract(uint32_t lid, NodeTy&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); size_t count = 0; for (galois::LargeArray& layer_degrees : @@ -85,7 +89,7 @@ struct SubgraphDegreeSync { } } - static bool reduce(uint32_t lid, char&, ValTy y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy y) { assert(y.size() == gnn_sampled_out_degrees_->size()); for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; @@ -93,7 +97,7 @@ struct SubgraphDegreeSync { return true; } - static bool reduce(uint32_t lid, char&, ValTy::value_type* y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy::value_type* y) { for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; @@ -102,7 +106,7 @@ struct SubgraphDegreeSync { } //! No-op: readAny = overwritten anyways; can probably get away with no-op - static void reset(uint32_t lid, char&) { + static void reset(uint32_t lid, NodeTy&) { for (galois::LargeArray& layer_degrees : *gnn_sampled_out_degrees_) { layer_degrees[lid] = 0; @@ -110,14 +114,14 @@ struct SubgraphDegreeSync { } //! element wise set - static void setVal(uint32_t lid, char&, ValTy y) { + static void setVal(uint32_t lid, NodeTy&, ValTy y) { assert(y.size() == gnn_sampled_out_degrees_->size()); for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; } } - static void setVal(uint32_t lid, char&, ValTy::value_type* y) { + static void setVal(uint32_t lid, NodeTy&, ValTy::value_type* y) { for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 6dbfdfbcf1..ad41def334 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -6,6 +6,14 @@ #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" #include "galois/MinibatchGenerator.h" +#include "galois/Logging.h" +#include "galois/graphs/ReadGraph.h" +#include "galois/GNNMath.h" +#include "galois/graphs/DegreeSyncStructures.h" + +#include +#include +#include #ifdef GALOIS_ENABLE_GPU #include "galois/graphs/GNNGraph.cuh" @@ -31,24 +39,99 @@ namespace graphs { //! 
Possible partitioning schemes for the GNN graph enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; +template class GNNGraph { public: - using GNNDistGraph = galois::graphs::DistGraph; - using GraphNode = GNNDistGraph::GraphNode; + using GNNDistGraph = galois::graphs::DistGraph; + using GraphNode = typename GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned using NodeIterator = boost::counting_iterator; - using EdgeIterator = GNNDistGraph::edge_iterator; + using EdgeIterator = typename GNNDistGraph::edge_iterator; // using GNNEdgeSortIterator = internal::EdgeSortIterator, // galois::LargeArray>>; GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label, bool useShad = false); + bool has_single_class_label, bool useWMD = false) + : GNNGraph(galois::default_gnn_dataset_path, dataset_name, + partition_scheme, has_single_class_label, + useWMD) {} + //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& input_directory, const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label, - bool useShad = false); + bool useWMD = false) + : input_directory_(input_directory) { + GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, + dataset_name); + // save host id + host_id_ = galois::runtime::getSystemNetworkInterface().ID; + host_prefix_ = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); + // load partition + partitioned_graph_ = LoadPartition(input_directory_, dataset_name, + partition_scheme, useWMD); + galois::gInfo(host_prefix_, "Loading partition is completed"); + // reverse edges + partitioned_graph_->ConstructIncomingEdges(); + // mark a node if it is sampled + mark_sampled_nodes_.resize(partitioned_graph_->size()); + + galois::gInfo(host_prefix_, "Number of local proxies is ", + partitioned_graph_->size()); + galois::gInfo(host_prefix_, "Number of local edges is ", + partitioned_graph_->sizeEdges()); + + // init gluon from the partitioned graph + sync_substrate_ = + std::make_unique>( + *partitioned_graph_, host_id_, + galois::runtime::getSystemNetworkInterface().Num, false, + partitioned_graph_->cartesianGrid()); + bitset_graph_aggregate.resize(partitioned_graph_->size()); + + // Construct/read additional graph data + if (useWMD) { + galois::gInfo("Feature is constructed by aggregating 2-hop features, " + "instead from feature files"); + this->ConstructFeatureBy2HopAggregation(); + this->ConstructLocalLabels(); + this->SetLocalMasksRandomly(); + } else { + if (dataset_name != "ogbn-papers100M-remap") { + ReadLocalLabels(dataset_name, has_single_class_label); + } else { + galois::gInfo("Remapped ogbn 100M"); + ReadLocalLabelsBin(dataset_name); + } + ReadLocalFeatures(dataset_name); + ReadLocalMasks(dataset_name); + } + + // init norm factors (involves a sync call) + InitNormFactor(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // allocate/copy data structures over to GPU + GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); + InitGPUMemory(); + + // initialize CUDA context + cuda_ctx_ = get_CUDA_context(host_id_); + if (!init_CUDA_context(cuda_ctx_, ::gpudevice)) { + GALOIS_DIE("Failed to initialize CUDA context"); + } + PartitionedGraphInfo g_info; + GetPartitionedGraphInfo(g_info); + load_graph_CUDA_GNN(cuda_ctx_, g_info, + galois::runtime::getSystemNetworkInterface().Num); + } 
+#endif + } //! Returns host id size_t host_id() const { return host_id_; } @@ -127,7 +210,32 @@ class GNNGraph { void InitializeSamplingData() { InitializeSamplingData(1, false); } //! Initialize data required to do graph sampling - void InitializeSamplingData(size_t num_layers, bool is_inductive); + void InitializeSamplingData(size_t num_layers, bool choose_all) { + subgraph_ = std::make_unique(partitioned_graph_->size()); + sample_node_timestamps_.create(partitioned_graph_->size(), + std::numeric_limits::max()); + edge_sample_status_.resize(num_layers); + for (size_t i = 0; i < num_layers; i++) { + edge_sample_status_[i].resize(partitioned_graph_->sizeEdges()); + } + sampled_edges_.resize(partitioned_graph_->sizeEdges()); + // this is to hold the degree of a sampled graph considering all hosts; yes, + // memory wise this is slightly problematic possibly, but each layer is its + // own subgraph + if (!choose_all) { + sampled_out_degrees_.resize(num_layers); + for (galois::LargeArray& array : sampled_out_degrees_) { + array.create(partitioned_graph_->size()); + } + } else { + subgraph_choose_all_ = true; + } + definitely_sampled_nodes_.resize(partitioned_graph_->size()); + master_offset_accum_.resize(num_layers + 1); + mirror_offset_accum_.resize(num_layers + 1); + sample_master_offsets_.resize(num_layers + 1, 0); + sample_mirror_offsets_.resize(num_layers + 1, 0); + } ////////////////////////////////////////////////////////////////////////////// // Out Edges @@ -169,7 +277,7 @@ class GNNGraph { }; galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> edges(GraphNode N) const { if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edges(N); @@ -247,7 +355,7 @@ class GNNGraph { } } galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> in_edges(GraphNode N) const { if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edges(N); @@ -306,21 +414,364 @@ class GNNGraph { size_t SetupNeighborhoodSample() { return SetupNeighborhoodSample(GNNPhase::kTrain); } - size_t SetupNeighborhoodSample(GNNPhase seed_phase); + size_t SetupNeighborhoodSample(GNNPhase seed_phase) { + DisableSubgraph(); + + if (!bitset_sample_flag_.size()) { + bitset_sample_flag_.resize(size()); + } + bitset_sample_flag_.ParallelReset(); + definitely_sampled_nodes_.ParallelReset(); + + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const NodeIterator& x) { + if (IsValidForPhase(*x, seed_phase)) { + SetSampledNode(*x); + bitset_sample_flag_.set(*x); + definitely_sampled_nodes_.set(*x); + } else { + UnsetSampledNode(*x); + } + }, + galois::loopname("InitialSeedSetting")); + // unsets nodes set in previous iterations; for some reason they get + // synchronized along with everything else even though bitset sample flag + // should prevent it (that, or it's because they don't get sync'd that they + // remain the same) + galois::do_all(galois::iterate(end_owned(), end()), + [&](const NodeIterator& x) { UnsetSampledNode(*x); }); + + // clear node timestamps + galois::StatTimer fill_time("ClearFillTime"); + fill_time.start(); + galois::ParallelSTL::fill(sample_node_timestamps_.begin(), + sample_node_timestamps_.end(), + std::numeric_limits::max()); + galois::ParallelSTL::fill(sample_master_offsets_.begin(), + sample_master_offsets_.end(), 0); + galois::ParallelSTL::fill(sample_mirror_offsets_.begin(), + sample_mirror_offsets_.end(), 0); + fill_time.stop(); + + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + 
master_offset_accum_[i].reset(); + mirror_offset_accum_[i].reset(); + } + + // clear all sampled edges + galois::StatTimer ctime("ClearSampleEdges"); + ctime.start(); + for (galois::DynamicBitSet& edge_layer : edge_sample_status_) { + edge_layer.ParallelReset(); + } + ctime.stop(); + // galois::do_all( + // galois::iterate(edge_sample_status_.begin(), + // edge_sample_status_.end()), + // [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, + // galois::loopname("ClearSampleEdges")); + + sampled_edges_.ParallelReset(); + + // reset all degrees + if (!subgraph_choose_all_) { + galois::StatTimer cad_timer("ClearAllDegrees"); + cad_timer.start(); + for (galois::LargeArray& array : sampled_out_degrees_) { + galois::ParallelSTL::fill(array.begin(), array.end(), 0); + } + cad_timer.stop(); + } + + if (!bitset_sampled_degrees_.size()) { + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + } + bitset_sampled_degrees_.reset(); + + // Seed nodes sync + SampleNodeSync("SeedNodeSample"); + + galois::GAccumulator local_seed_count; + local_seed_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); + // count # of seed nodes + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } + + // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); + local_seed_count += 1; + // 0 = seed node + sample_node_timestamps_[*x] = 0; + } + }, + galois::loopname("SeedNodeOffsetCounting")); + + sample_master_offsets_[0] = master_offset.reduce(); + sample_mirror_offsets_[0] = mirror_offset.reduce(); + + return local_seed_count.reduce(); + } //! 
Choose all edges from sampled nodes size_t SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, - size_t timestamp); + size_t timestamp) { + DisableSubgraph(); + + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& src_iter) { + // only operate on if sampled + if (IsInSampledGraph(src_iter)) { + // marks ALL edges of nodes that connect to train/other nodes + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { + // total += 1; + if (inductive_subgraph) { + if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; + } + } + + MakeEdgeSampled(edge_iter, agg_layer_num); + uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); + if (!IsInSampledGraph(dest)) { + bitset_sample_flag_.set(dest); + } + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(dest); + } + } + }, + galois::steal(), galois::loopname("ChooseAllEdges")); + + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, + galois::loopname("NeighborhoodSampleSet")); + + SampleNodeSync("SampleFlag"); + + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == + std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + EnableSubgraphChooseAll(); + return local_sample_count.reduce(); + } + //! 
Sample neighbors of nodes that are marked as ready for sampling size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, - bool inductive_subgraph, size_t timestamp); + bool inductive_subgraph, size_t timestamp) { + use_subgraph_ = false; + use_subgraph_view_ = false; + + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& src_iter) { + // only operate on if sampled + if (IsInSampledGraph(src_iter)) { + // chance of not uniformly choosing an edge of this node + // num_to_sample times (degree norm is 1 / degree) + double probability_of_reject; + if (!inductive_subgraph) { + probability_of_reject = + std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); + } else { + probability_of_reject = std::pow( + 1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); + } + + // loop through edges, turn "on" edge with some probability + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { + if (sample_rng_.DoBernoulli(probability_of_reject)) { + if (inductive_subgraph) { + // only take if node is training node or a node not classified + // into train/test/val + if (!IsValidForPhase( + partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase( + partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; + } + } + + uint32_t edge_dst = partitioned_graph_->getEdgeDst(edge_iter); + // if here, it means edge accepted; set sampled on, mark + // as part of next set + MakeEdgeSampled(edge_iter, sample_layer_num); + if (!IsInSampledGraph(edge_dst)) { + bitset_sample_flag_.set(edge_dst); + } + bitset_sampled_degrees_.set(*src_iter); + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(edge_dst); + // degree increment + sampled_out_degrees_[sample_layer_num][*src_iter]++; + } + } + } + }, + galois::steal(), galois::loopname("NeighborhoodSample")); + + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, + galois::loopname("NeighborhoodSampleSet")); + + // why not read source? even if it doesn't need to sample anything, it needs + // to know that it's active so that subgraph construction can proceed + // correctly + SampleNodeSync("SampleFlag"); + + // count sampled node size + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == + std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + DisableSubgraphChooseAll(); + return local_sample_count.reduce(); + } std::vector ConstructSampledSubgraph(size_t num_sampled_layers) { return ConstructSampledSubgraph(num_sampled_layers, false); }; //! 
Construct the subgraph from sampled edges and corresponding nodes std::vector ConstructSampledSubgraph(size_t num_sampled_layers, - bool use_view); + bool use_view) { + // false first so that the build process can use functions to access the + // real graph + DisableSubgraph(); + + gnn_sampled_out_degrees_ = &sampled_out_degrees_; + + // first, sync the degres of the sampled edges across all hosts + // read any because destinations need it to for reverse phase + if (use_timer_) { + sync_substrate_->template sync< + writeSource, readAny, SubgraphDegreeSync, SubgraphDegreeBitset>( + "SubgraphDegree"); + } else { + sync_substrate_->template sync< + writeSource, readAny, SubgraphDegreeSync, SubgraphDegreeBitset>( + "Ignore"); + } + + galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); + offsets_n_rows_time.start(); + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsActiveInSubgraph(*x)) { + if (sample_node_timestamps_[*x] != + std::numeric_limits::max()) { + if (*x < *end_owned()) { + // master + master_offset_accum_[sample_node_timestamps_[*x]] += 1; + } else { + // mirror + mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + } + } else { + GALOIS_LOG_FATAL( + "should have been timestamped at some point if active"); + } + } + }, + galois::loopname("MasterMirrorOffset")); + + std::vector new_rows(master_offset_accum_.size()); + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + sample_master_offsets_[i] = master_offset_accum_[i].reduce(); + sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); + new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; + if (i > 0) { + new_rows[i] += new_rows[i - 1]; + } + } + + offsets_n_rows_time.stop(); + + if (!use_view) { + subgraph_->BuildSubgraph(*this, num_sampled_layers); + } else { + // a view only has lid<->sid mappings + subgraph_->BuildSubgraphView(*this, num_sampled_layers); + } + + sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors(), + use_timer_); + + // after this, this graph is a subgraph + if (!use_view) { + use_subgraph_ = true; + } else { + use_subgraph_view_ = true; + } + + return new_rows; + } unsigned SampleNodeTimestamp(unsigned lid) const { return sample_node_timestamps_[lid]; @@ -410,7 +861,23 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes - size_t PrepareNextTrainMinibatch(); + size_t PrepareNextTrainMinibatch() { + train_batcher_->GetNextMinibatch(&local_minibatch_mask_); +#ifndef NDEBUG + size_t count = 0; + // galois::gPrint("Minibatch : "); + for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { + if (local_minibatch_mask_[i]) { + // galois::gPrint(partitioned_graph_->getGID(i), ","); + count++; + } + } + // galois::gPrint("\n"); + galois::gInfo(host_prefix(), "Batched nodes ", count); +#endif + return SetupNeighborhoodSample(GNNPhase::kBatch); + } + // Used with distributed minibatch tracker // size_t PrepareNextTrainMinibatch(size_t num_to_get) { // train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); @@ -419,6 +886,169 @@ class GNNGraph { //! 
Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructFeatureBy2HopAggregation() { + galois::StatTimer timer("ConstructFeatureBy2HopAggregation"); + if (this->use_timer_) { + timer.start(); + } + + // TODO(hc): This constant is from SHAD implementation. + // This will be an user parameter for general/flexible support. + + // The first 15 floats are for the current node feature, + // and the another 15 floats are for the aggregated neighbor's node feature. + // These two 15-dimension features are concateneated to a single feature + // for each node. + this->node_feature_length_ = 30; + this->local_node_features_.resize( + this->partitioned_graph_->size() * this->node_feature_length_, 0.f); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + this->ConstructFeatureBy2HopAggregationGPU(); + } else { +#endif + this->ConstructFeatureBy2HopAggregationCPU(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + if (this->use_timer_) { + timer.stop(); + } + } + + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructFeatureBy2HopAggregation() {} + + void ConstructFeatureBy2HopAggregationGPU() { + // TODO(hc): This might not be used in the future. + // This might be renamed to use "PANDO" instead of "GPU". + // For now, just following the existing code format. + GALOIS_LOG_FATAL( + "ConstructFeatureBy2HopAggregationGPU() is not supported."); + } + + void ConstructFeatureBy2HopAggregationCPU() { + galois::gInfo("Construct an initial feature on CPU by " + "aggregating and concatenating neighbors' features."); + //this->PrintFeatures("0hop"); + // this->FillTestNodeType(); + //this->PrintGraphTopo("before"); + this->Construct1HopFeatureCPU(); + //this->PrintFeatures("1hop"); + this->Construct2HopFeatureCPU(); + this->PrintFeatures("2hop"); + } + + void PrintFeatures(std::string postfix) { + // XXX(hc): Printing code for correctness check. + auto& net = galois::runtime::getSystemNetworkInterface(); + unsigned host_id = net.ID; + std::ofstream fp(postfix + "." + std::to_string(host_id) + ".feat"); + for (size_t lid = 0; lid < this->partitioned_graph_->size(); ++lid) { + /* + size_t gid = this->partitioned_graph_->getGID(lid); + fp << "src:" << gid << ", " << + this->partitioned_graph_->getData(lid).type << ", " << + this->partitioned_graph_->getData(lid).key << "\n"; + for (size_t i = 0; i < this->node_feature_length_; ++i) { + fp << "\t [" << i << "] = " << + this->local_node_features_[lid * this->node_feature_length_ + i] + << "\n"; + } + */ + fp << this->partitioned_graph_->getData(lid).key; + for (size_t i = 0; i < this->node_feature_length_; ++i) { + fp << "," + << this->local_node_features_[lid * this->node_feature_length_ + i]; + } + fp << "\n"; + } + fp.close(); + } + + /// Construct feature from 1-hop neighbors. + /// This method traverses 1-hop outgoing neighbors from each vertex + /// and constructs a histogram of the outgoing edge type and + /// the outgoing neighbor type. + void Construct1HopFeatureCPU() { + auto& graph = *(this->partitioned_graph_); + // Aggregate adjacent node and edge types and construct + // an intermediate feature. 
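+    // Illustrative example (hypothetical values, not from any dataset): a
+    // vertex with three out-neighbors of types {2, 2, 5}, all reached over
+    // 0-typed edges, accumulates
+    //   feature[0] += 3                      (one increment per 0-typed edge)
+    //   feature[2] += 2, feature[5] += 1     (out-neighbor type histogram)
+    // in the first half (slots 0..14) of its 30-float vector;
+    // Construct2HopFeatureCPU() then fills the second half (slots 15..29)
+    // with the element-wise sum of its out-neighbors' first halves.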
+ galois::do_all( + galois::iterate(size_t{0}, graph.size()), + [&](size_t src_lid) { + bitset_graph_aggregate.set(src_lid); + for (auto edge_iter = graph.edge_begin(src_lid); + edge_iter < graph.edge_end(src_lid); ++edge_iter) { + size_t dst_lid = graph.getEdgeDst(edge_iter); + uint32_t dst_type = graph.getData(dst_lid).type; + uint64_t edge_type = graph.getEdgeData(edge_iter); + // Aggregate out neighbors' types. + ++this->local_node_features_[this->node_feature_length_ * src_lid + + dst_type]; + // TODO(hc): Assume that edge type is always 0. + // So, the 0th feature value of a node should be + // (degree of the node + sum of type-0 neighbors). + ++this->local_node_features_[this->node_feature_length_ * src_lid + + edge_type]; + } + }, + galois::steal(), galois::loopname("Construct1HopFeatureCPU")); + + gnn_matrix_to_sync_ = this->local_node_features_.data(); + gnn_matrix_to_sync_column_length_ = this->node_feature_length_; + // All the source vertices reduce and update proxies' data + // and both the source and destination vertices set those + // updated data to their data. + sync_substrate_->template sync, + Bitset_graph_aggregate>( + "GraphAggregateSync"); + } + + + /// Construct feature from 2-hop neighbors. + /// After `Construct1HopFeatureCPU()`, each vertex aggregates types of + /// the outgoing edges and neighbors, and constructs a histogram for + /// its feature. Now, in this method, each vertex aggregates those + /// histograms from outgoing neighbors and constructs a new histogram. + /// Then, each vertex appends this new histogram to the old histogram + /// as its feature. + void Construct2HopFeatureCPU() { + auto& graph = *(this->partitioned_graph_); + // Aggregate neighbor nodes' features and append (concatenate) it to the + // current node feature. So the first half is the current node and + // the next half is the aggregated node feature. + galois::do_all( + galois::iterate(size_t{0}, graph.size()), + [&](size_t src_lid) { + // Offset for the second part of the source node feature. + size_t src_foffset = this->node_feature_length_ * src_lid + + this->node_feature_length_ / 2; + bitset_graph_aggregate.set(src_lid); + for (auto edge_iter = graph.edge_begin(src_lid); + edge_iter < graph.edge_end(src_lid); ++edge_iter) { + size_t dst_lid = graph.getEdgeDst(edge_iter); + // Offset for the first part of the destination node feature. + size_t dst_foffset = this->node_feature_length_ * dst_lid; + for (size_t fid = 0; fid < this->node_feature_length_ / 2; ++fid) { + // Aggregate outgoing neighbors' features and, + // construct and append a new histogram to the old one. + this->local_node_features_[src_foffset + fid] += + this->local_node_features_[dst_foffset + fid]; + } + } + }, + galois::steal(), galois::loopname("Construct2HopFeatureCPU")); + this->SHADFeatureAggregateSync(this->local_node_features_.data(), + this->node_feature_length_); + } + ////////////////////////////////////////////////////////////////////////////// void SetupTestBatcher(size_t test_batch_size) { @@ -433,7 +1063,11 @@ class GNNGraph { void ResetTestMinibatcher() { test_batcher_->ResetMinibatchState(); } //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes - size_t PrepareNextTestMinibatch(); + size_t PrepareNextTestMinibatch() { + test_batcher_->GetNextMinibatch(&local_minibatch_mask_); + return SetupNeighborhoodSample(GNNPhase::kBatch); + } + //! 
Returns true if there are still more minibatches in this graph bool MoreTestMinibatches() { return !test_batcher_->NoMoreMinibatches(); }; @@ -487,12 +1121,47 @@ class GNNGraph { // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracy(predictions, phase, false); + } + float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase, - bool sampling); + bool sampling) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracyCPU(predictions, phase, sampling); + } std::pair - GetBatchAccuracy(PointerWithSize predictions); + GetBatchAccuracy(PointerWithSize predictions) { + // check owned nodes' accuracy + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + // will only loop over sampled nodes if sampling is on + galois::iterate(begin_owned(), end_owned()), + // this is possibly the subgraph id + [&](const unsigned node_id) { + if (IsValidForPhase(node_id, GNNPhase::kBatch)) { + total_checked_ += 1; + size_t predicted_label = + galois::MaxIndex(num_label_classes_, + &(predictions[node_id * num_label_classes_])); + if (predicted_label == + static_cast(GetSingleClassLabel(node_id))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + return std::make_pair(global_correct, global_checked); + } //! Returns the ground truth label of some local id assuming labels are single //! class labels. @@ -561,6 +1230,49 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// + //! @brief Variant of the plain feature aggregation. + //! @detail This is a variant version of the dense feature aggregation + //! that follows SHAD GNN feature construction. This aggregates features of + //! the neighbor vertices that are from (vertex's feature offset + + //! 1/2 * feature length) to (vertex's feature offset + feature length), + //! to (vertex's feature offset) of the current vertex, from its proxies. + //! + //! @param matrix_to_sync Float pointer pointing to features of the target + //! vertex + //! @param matrix_column_size Feature length to calculate a base offset of + //! each vertex + void SHADFeatureAggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size) const { + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + + // set globals for the sync substrate + if (use_timer_) { + sync_substrate_->template sync< + writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( + "SHADGraphAggregateSync"); + } else { + sync_substrate_->template sync< + writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( + "Ignore"); + } + } + + void SampleNodeSync(std::string stat_str) { + sampled_nodes_ = &(this->mark_sampled_nodes_); + + // set globals for the sync substrate + if (use_timer_) { + sync_substrate_->template sync, SampleFlagBitset>( + stat_str); + } else { + sync_substrate_->template sync, SampleFlagBitset>( + "Ignore"); + } + } + // TODO(loc) Should not be a default version of this to avoid potential // issues later void AggregateSync(GNNFloat* matrix_to_sync, @@ -575,7 +1287,58 @@ class GNNGraph { //! Note that it's const because the only thing being used is the graph //! 
topology of this object; the thing modified is the passed in matrix void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - bool is_backward, uint32_t active_row_boundary) const; + bool is_backward, uint32_t active_row_boundary) const { + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + subgraph_size_ = active_size(); + num_active_layer_rows_ = active_row_boundary; + + if (!use_subgraph_ && !use_subgraph_view_) { + // set globals for the sync substrate + if (!is_backward) { + if (use_timer_) { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("GraphAggregateSync"); + } else { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("Ignore"); + } + } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>( + "BackwardGraphAggregateSync"); + clubbed_timer.stop(); + } + } else { + // setup the SID to LID map for the sync substrate to use (SID != LID) + gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); + + if (!is_backward) { + if (use_timer_) { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("GraphAggregateSync"); + } else { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("Ignore"); + } + } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); + sync_substrate_ + ->template sync, Bitset_graph_aggregate>( + "BackwardGraphAggregateSync"); + clubbed_timer.stop(); + } + } + } ////////////////////////////////////////////////////////////////////////////// // Sampling related @@ -613,17 +1376,87 @@ class GNNGraph { } //! Calculate norm factor considering the entire graph - void CalculateFullNormFactor(); + void CalculateFullNormFactor() { + // TODO(loc) reset all degrees if this is called multiple times? 
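+    // These totals back GetGlobalDegreeNorm()/GetGlobalTrainDegreeNorm()
+    // (degree norm = 1 / degree); global_train_degrees_ only counts edges
+    // whose destination is a train/other vertex, which the inductive
+    // sampling path uses. In SampleEdges() the chance that a given edge is
+    // never picked across num_to_sample uniform draws is
+    // (1 - 1/degree)^num_to_sample, e.g. degree 4 with a budget of 2 gives
+    // (3/4)^2 ~= 0.56 as the per-edge rejection probability.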
+ // get the norm factor contribution for each node based on the GLOBAL graph + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t src) { + for (auto edge_iter = partitioned_graph_->edge_begin(src); + edge_iter != partitioned_graph_->edge_end(src); edge_iter++) { + // count degrees for all + train/other + size_t dest = GetEdgeDest(edge_iter); + if (IsValidForPhase(dest, GNNPhase::kTrain) || + IsValidForPhase(dest, GNNPhase::kOther)) { + global_train_degrees_[src] += 1; + } + global_degrees_[src] += 1; + } + }, + galois::loopname("CalculateLocalDegrees")); + // degree sync + gnn_degree_vec_1_ = global_train_degrees_.data(); + gnn_degree_vec_2_ = global_degrees_.data(); + sync_substrate_ + ->template sync>( + "InitialDegreeSync"); + } #ifdef GALOIS_ENABLE_GPU void AggregateSyncGPU(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - const unsigned layer_number) const; + const unsigned layer_number) const { + size_t layer_input_mtx_column_size = + getLayerInputMatrixColumnSize(cuda_ctx_, layer_number); + size_t layer_output_mtx_column_size = + getLayerOutputMatrixColumnSize(cuda_ctx_, layer_number); + // set globals for the sync substrate + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + cuda_ctx_for_sync = cuda_ctx_; + layer_number_to_sync = layer_number; + // TODO bitset setting + // call sync + cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, + size(), layer_number); + + // XXX no timer if use_timer is off + if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { + if (use_timer_) { + sync_substrate_->template sync>( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->template sync>( + "Ignore", gnn_matrix_to_sync_column_length_); + } + } else if (gnn_matrix_to_sync_column_length_ == + layer_output_mtx_column_size) { + if (use_timer_) { + sync_substrate_->template sync>( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->template sync>( + "Ignore", gnn_matrix_to_sync_column_length_); + } + } else { + GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" + " match to the column size of the CUDA context"); + } + } void InitLayerVectorMetaObjects(size_t layer_number, unsigned num_hosts, - size_t infl_in_size, size_t infl_out_size); + size_t infl_in_size, size_t infl_out_size) { + init_CUDA_layer_vector_meta_obj(cuda_ctx_, layer_number, num_hosts, size(), + infl_in_size, infl_out_size); + } - void ResizeGPULayerVector(size_t num_layers); + void ResizeGPULayerVector(size_t num_layers) { + resize_CUDA_layer_vector(cuda_ctx_, num_layers); + } const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } @@ -636,7 +1469,64 @@ class GNNGraph { } #endif - void ContiguousRemap(const std::string& new_name); + void ContiguousRemap(const std::string& new_name) { + node_remapping_.resize(partitioned_graph_->size()); + + uint32_t new_node_id = 0; + + // serial loops because new ID needs to be kept consistent + // first, train nodes + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTrain)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Train nodes are from 0 to ", new_node_id); + + // second, val nodes + uint32_t val_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kValidate)) 
{ + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Val nodes are from ", val_start, " to ", new_node_id, "(", + new_node_id - val_start, ")"); + + // third, test nodes + uint32_t test_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTest)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Test nodes are from ", test_start, " to ", new_node_id, "(", + new_node_id - test_start, ")"); + + // last, everything else + uint32_t other_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kOther)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Other nodes are from ", other_start, " to ", new_node_id, + "(", new_node_id - other_start, ")"); + GALOIS_LOG_ASSERT(new_node_id == partitioned_graph_->size()); + + // save the mapping to a binary file for use by graph convert to deal with + // the gr + std::string label_filename = input_directory_ + new_name + "-mapping.bin"; + std::ofstream label_write_stream; + label_write_stream.open(label_filename, std::ios::binary | std::ios::out); + label_write_stream.write((char*)node_remapping_.data(), + sizeof(uint32_t) * node_remapping_.size()); + label_write_stream.close(); + } void EnableTimers() { use_timer_ = true; @@ -675,46 +1565,909 @@ class GNNGraph { // Initialization ////////////////////////////////////////////////////////////////////////////// - void ReadLocalLabelsBin(const std::string& dataset_name); + //! Partitions a particular dataset given some partitioning scheme + std::unique_ptr LoadPartition( + const std::string& input_directory, const std::string& dataset_name, + galois::graphs::GNNPartitionScheme partition_scheme, bool useWMD) { + // XXX input path + std::string input_file = input_directory + dataset_name + ".csgr"; + if (useWMD) { + input_file = dataset_name; + } + GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); + + // load partition + switch (partition_scheme) { + case galois::graphs::GNNPartitionScheme::kOEC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", + false, 1); + case galois::graphs::GNNPartitionScheme::kCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", + false, 1); + case galois::graphs::GNNPartitionScheme::kOCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", + false, 1); + default: + GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); + return nullptr; + } + } + + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructLocalLabels() { + GALOIS_LOG_VERBOSE("[{}] Constructing labels from disk...", host_id_); + auto& graph = *(this->partitioned_graph_); + // For WMD graph, we always assume a single class label. + // allocate memory for labels + // single-class (one-hot) label for each vertex: N x 1 + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(graph.size()); + // In WMD graphs, a vertex class is a vertex type. + // As the vertex type is already materialized in a vertex data, + // iterate a graph and extract that. 
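+    // Note that the raw `type` value is stored directly as the class id
+    // while num_label_classes_ only counts distinct types, so this path
+    // implicitly assumes vertex types are already dense in
+    // [0, num_label_classes_); e.g. types {0, 1, 2} line up with the
+    // prediction indices used by the accuracy code, whereas sparse ids such
+    // as {0, 3, 7} would not.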
+ // TODO(hc): Using concurrent set using a finer-grained lock + // is better + std::mutex label_class_set_mtx; + std::unordered_set label_class_set; + galois::do_all( + galois::iterate(size_t{0}, graph.size()), + [&](size_t lid) { + local_ground_truth_labels_[lid] = graph.getData(lid).type; + label_class_set_mtx.lock(); + auto found = label_class_set.find(local_ground_truth_labels_[lid]); + if (found == label_class_set.end()) { + label_class_set.emplace(local_ground_truth_labels_[lid]); + ++num_label_classes_; + } + label_class_set_mtx.unlock(); + }); + + // Exchange found local vertex classes with other hosts to + // calculate the total number of the classes. + // + // Serialize the label class set to a vector to serialize this data + // to galois::runtime::SendBuffer. The current libdist does not + // support std::set and std::unordered_set de/serialization. + // TODO(hc): support this type of serialization. + std::vector label_vec(label_class_set.begin(), label_class_set.end()); + auto &net = galois::runtime::getSystemNetworkInterface(); + for (uint32_t h = 0; h < net.Num; ++h) { + if (h == net.ID) { continue; } + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, label_vec); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + for (uint32_t h = 0; h < net.Num - 1; ++h) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + + std::vector h_label_vec; + galois::runtime::gDeserialize(p->second, h_label_vec); + galois::do_all(galois::iterate(h_label_vec), + [&](int i) { + label_class_set_mtx.lock(); + auto found = label_class_set.find(i); + if (found == label_class_set.end()) { + label_class_set.emplace(i); + // Increaes the number of classes only if + // it was not found in the local host. 
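+                     // e.g. if this host found {0, 3} locally and the
+                     // remote host sends {3, 7}, only 7 is new here, so
+                     // every host ends up with num_label_classes_ == 3
+                     // (the size of the union) once all exchanges finish.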
+ ++num_label_classes_; + } + label_class_set_mtx.unlock(); + } ); + } + increment_evilPhase(); + } + + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructLocalLabels() {} + + void ReadLocalLabelsBin(const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + + std::ifstream file_stream; + file_stream.open(input_directory_ + dataset_name + "-labels-dims.txt", + std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } + file_stream.close(); + + std::string filename = input_directory_ + dataset_name + "-labels.bin"; + std::ifstream file_stream_bin; + file_stream_bin.open(filename, std::ios::binary | std::ios::in); + + std::vector all_labels(num_nodes); + // read all labels into a vector + file_stream_bin.read((char*)all_labels.data(), + sizeof(GNNLabel) * num_nodes); + + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(partitioned_graph_->size()); + + galois::GAccumulator found_local_vertices; + found_local_vertices.reset(); + + // save only local ones; can do in parallel as well + // assumes -1 already dealt with + galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t lid) { + local_ground_truth_labels_[lid] = all_labels[GetGID(lid)]; + found_local_vertices += 1; + }); + + size_t fli = found_local_vertices.reduce(); + galois::gInfo(host_prefix_, "Read ", fli, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); + GALOIS_LOG_ASSERT(fli == partitioned_graph_->size()); + } + //! Read labels of local nodes only void ReadLocalLabels(const std::string& dataset_name, - bool has_single_class_label); + bool has_single_class_label) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + std::string filename; + if (has_single_class_label) { + filename = input_directory_ + dataset_name + "-labels.txt"; + } else { + filename = input_directory_ + dataset_name + "-mlabels.txt"; + } + + // read file header, save num label classes while at it + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } + + // allocate memory for labels + if (has_single_class_label) { + // single-class (one-hot) label for each vertex: N x 1 + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(partitioned_graph_->size()); + } else { + // multi-class label for each vertex: N x num classes + using_single_class_labels_ = false; + local_ground_truth_labels_.resize(partitioned_graph_->size() * + num_label_classes_); + } + + size_t cur_gid = 0; + size_t found_local_vertices = 0; + // each line contains a set of 0s and 1s + std::string read_line; + + // loop through all labels of the graph + while (std::getline(file_stream, read_line)) { + // only process label if this node is local + if (partitioned_graph_->isLocal(cur_gid)) { + uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); + // read line as bitset of 0s and 1s + std::istringstream label_stream(read_line); + int cur_bit; + // bitset size is # of label classes + for (size_t cur_class = 0; cur_class < num_label_classes_; + ++cur_class) { + // read 
a bit + label_stream >> cur_bit; + + if (has_single_class_label) { + // no label + if (cur_bit == -1) { + local_ground_truth_labels_[cur_lid] = num_label_classes_; + break; + } + + // in single class, only 1 bit is set in bitset; that represents the + // class to take + if (cur_bit != 0) { + // set class and break (assumption is that's the only bit that is + // set) + local_ground_truth_labels_[cur_lid] = cur_class; + break; + } + } else { + // else the entire bitset needs to be copied over to the label array + // TODO this can possibly be saved all at once rather than bit by + // bit? + local_ground_truth_labels_[cur_lid * num_label_classes_ + + cur_class] = cur_bit; + } + } + found_local_vertices++; + } + // always increment cur_gid + cur_gid++; + } + + file_stream.close(); + + galois::gInfo(host_prefix_, "Read ", found_local_vertices, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); + GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); + } + //! Read features of local nodes only - void ReadLocalFeatures(const std::string& dataset_str); + void ReadLocalFeatures(const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); + + // read in dimensions of features, specifically node feature length + size_t num_global_vertices; + + std::string file_dims = input_directory_ + dataset_name + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> num_global_vertices >> node_feature_length_; + ifs.close(); + + GALOIS_LOG_ASSERT(num_global_vertices == partitioned_graph_->globalSize()); + GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_global_vertices, + node_feature_length_); + + // memory for all features of all nodes in graph + // TODO read features without loading entire feature file into memory; this + // is quite inefficient + std::unique_ptr full_feature_set = std::make_unique( + num_global_vertices * node_feature_length_); + + // read in all features + std::ifstream file_stream; + std::string feature_file = input_directory_ + dataset_name + "-feats.bin"; + file_stream.open(feature_file, std::ios::binary | std::ios::in); + file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * + num_global_vertices * + node_feature_length_); + file_stream.close(); + + // allocate memory for local features + local_node_features_.resize(partitioned_graph_->size() * + node_feature_length_); + + // copy over features for local nodes only + galois::GAccumulator num_kept_vertices; + num_kept_vertices.reset(); + galois::do_all( + galois::iterate(size_t{0}, num_global_vertices), [&](size_t gid) { + if (partitioned_graph_->isLocal(gid)) { + // copy over feature vector + std::copy(full_feature_set.get() + gid * node_feature_length_, + full_feature_set.get() + (gid + 1) * node_feature_length_, + &local_node_features_[partitioned_graph_->getLID(gid) * + node_feature_length_]); + num_kept_vertices += 1; + } + }); + full_feature_set.reset(); + + galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), + " features (", + local_node_features_.size() * double{4} / (1 << 30), " GB)"); + GALOIS_LOG_ASSERT(num_kept_vertices.reduce() == partitioned_graph_->size()); + } + //! Helper function to read masks from file into the appropriate structures //! 
given a name, mask type, and arrays to save into size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, std::vector* masks); + GNNRange* mask_range, + std::vector* masks) { + size_t range_begin; + size_t range_end; + + // read mask range + std::string mask_filename = + input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; + bool train_is_on = false; + if (mask_type == "train") { + train_is_on = true; + } + + std::ifstream mask_stream; + mask_stream.open(mask_filename, std::ios::in); + mask_stream >> range_begin >> range_end >> std::ws; + GALOIS_LOG_ASSERT(range_begin <= range_end); + + // set the range object + mask_range->begin = range_begin; + mask_range->end = range_end; + mask_range->size = range_end - range_begin; + + size_t cur_line_num = 0; + // valid nodes on this host + size_t local_sample_count = 0; + // this tracks TOTAL # of valid nodes in this group (not necessarily valid + // ones on this host) + size_t valid_count = 0; + std::string line; + // each line is a number signifying if mask is set for the vertex + while (std::getline(mask_stream, line)) { + std::istringstream mask_stream(line); + // only examine vertices/lines in range + if (cur_line_num >= range_begin && cur_line_num < range_end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + valid_count++; + if (partitioned_graph_->isLocal(cur_line_num)) { + (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; + local_sample_count++; + } + if (train_is_on) { + global_training_mask_[cur_line_num] = 1; + } + } + } + cur_line_num++; + } + mask_stream.close(); + + if (train_is_on) { + global_training_count_ = valid_count; + } + + if (valid_count != mask_range->size) { + // overlapping masks: need to actually check the masks rather than use + // ranges + if (!incomplete_masks_) { + galois::gInfo( + "Masks are not contained in range: must actually check mask"); + } + incomplete_masks_ = true; + } + + return valid_count; + } + //! Finds nodes that aren't part of the 3 main GNN phase classifications - size_t FindOtherMask(); + size_t FindOtherMask() { + galois::GAccumulator other_accum; + other_accum.reset(); + other_mask_.resize(partitioned_graph_->size()); + + galois::do_all( + galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t local_id) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain) && + !IsValidForPhase(local_id, GNNPhase::kValidate) && + !IsValidForPhase(local_id, GNNPhase::kTest)) { + other_mask_[local_id] = 1; + other_accum += 1; + } + }, + galois::loopname("FindOtherMask")); + return other_accum.reduce(); + } + + //! @brief Choose and set local training/validation/testing vertices + //! consecutively. 
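+  //! For example, with globalSize() == 1000 this marks vertices [0, 250) as
+  //! training, [250, 375) as testing and [375, 500) as validation; the
+  //! remaining vertices are left unmasked.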
+ void SetLocalMasksConsecutively() { + // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); + + global_training_count_ = partitioned_graph_->globalSize() / 4; + size_t global_testing_count = global_training_count_ / 2; + global_training_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_testing_mask_range_ = { + .begin = global_training_count_, + .end = global_training_count_ + global_testing_count, + .size = global_testing_count + }; + global_validation_mask_range_ = { + .begin = global_training_count_ + global_testing_count, + .end = global_training_count_ + 2 * global_testing_count, + .size = global_testing_count + }; + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + global_training_mask_[i] = 1; + } + + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + } + + //! @brief Randomly choose and set local training/validation/testing + //! vertices. This mimics what AGILE GNN does through Pytorch + //! `DistributedRandomSampler`. + void DistributedRandomSampling( + size_t local_sample_size, std::vector* masks) { + // Pytorch's DistributedRandomSampler, + // first materializes an array populated with + // 0 to (num_local_vertices - 1), shuffles this array, and + // extracts 0 to (num_local_shuffle - 1) vertices. + // This method mimics this operation. + // Like Pytorch, all the hosts use the same seed, and so, + // deterministically choose each type of vertices for not only + // the current host, but also others, and mark vertices to + // the corresponding mask array if they are locals. + auto& net = galois::runtime::getSystemNetworkInterface(); + std::vector< + std::pair> num_masters_per_hosts(net.Num); + std::pair master_ranges = + { partitioned_graph_->getGID(0), + partitioned_graph_->getGID(partitioned_graph_->numMasters() - 1) }; + // 1) Exchange node master ranges, and so, each host knows + // the range of vertex sampling. + for (uint32_t h = 0; h < net.Num; ++h) { + if (h == net.ID) { continue; } + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, master_ranges); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + for (uint32_t h = 0; h < net.Num - 1; ++h) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + + galois::runtime::gDeserialize(p->second, + num_masters_per_hosts[p->first]); + } + increment_evilPhase(); + + // 2) Sample vertices and mark them to the `masks` array + // if a vertex is local. + for (uint32_t h = 0; h < net.Num; ++h) { + size_t h_begin = (h == net.ID)? master_ranges.first : num_masters_per_hosts[h].first; + size_t h_end = (h == net.ID)? 
master_ranges.second : num_masters_per_hosts[h].second; + std::vector h_all_indices(h_end - h_begin); + // Fill global vertex ids to h_global_ids. + galois::do_all(galois::iterate(h_begin, h_end), + [&](size_t i) { + h_all_indices[i - h_begin] = i; + } ); + std::mt19937 rand(0); + std::shuffle(h_all_indices.begin(), h_all_indices.end(), rand); + galois::do_all( + galois::iterate(size_t{0}, local_sample_size), + [&](size_t i) { + // First, it doens't have duplications. + // Second, only mark `masks` if the checking vertex is a local + // vertex. + if (partitioned_graph_->isLocal(h_all_indices[i])) { + (*masks)[partitioned_graph_->getLID(h_all_indices[i])] = 1; + } + } ); + } + } + + void SetLocalMasksRandomly() { + // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); + + auto& net = galois::runtime::getSystemNetworkInterface(); + global_training_count_ = partitioned_graph_->globalSize() / 4; + size_t global_testing_count = global_training_count_ / 2; + size_t num_local_training_samples = global_training_count_ / net.Num; + size_t num_local_testing_samples = global_testing_count / net.Num; + size_t num_local_validating_samples = num_local_testing_samples; + global_training_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_testing_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_validation_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + + incomplete_masks_ = true; + DistributedRandomSampling( + num_local_training_samples, &local_training_mask_); + DistributedRandomSampling( + num_local_testing_samples, &local_testing_mask_); + DistributedRandomSampling( + num_local_validating_samples, &local_validation_mask_); + } + //! Read masks of local nodes only for training, validation, and testing - void ReadLocalMasks(const std::string& dataset_name); - //! Reads the entire graph topology in (but nothing else) - void ReadWholeGraph(const std::string& dataset_name); + void ReadLocalMasks(const std::string& dataset_name) { + // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); + + if (dataset_name == "reddit") { + global_training_count_ = 153431; + + // TODO reddit is hardcode handled at the moment; better way to not do + // this? 
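+      // These constants match the Reddit split commonly used by
+      // GraphSAGE-style benchmarks (232,965 vertices in total:
+      // 153,431 train / 23,831 val / 55,703 test). Datasets without a
+      // hardcoded split fall through to ReadLocalMasksFromFile(), which
+      // expects <dataset>-train_mask.txt / -val_mask.txt / -test_mask.txt:
+      // a "begin end" header line followed by one 0/1 flag per global vertex
+      // (lines outside the range are ignored). A toy 5-vertex file whose
+      // training set is vertices 1..3 would look like:
+      //   1 4
+      //   0
+      //   1
+      //   1
+      //   1
+      //   0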
+ global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; + global_validation_mask_range_ = { + .begin = 153431, .end = 153431 + 23831, .size = 23831}; + global_testing_mask_range_ = { + .begin = 177262, .end = 177262 + 55703, .size = 55703}; + + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + global_training_mask_[i] = 1; + } + + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + } else if (dataset_name == "ogbn-papers100M-remap") { + global_training_count_ = 1207178; + + global_training_mask_range_ = { + .begin = 0, .end = 1207178, .size = 1207178}; + global_validation_mask_range_ = { + .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; + global_testing_mask_range_ = { + .begin = 1332442, .end = 1332442 + 214337, .size = 214337}; + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + global_training_mask_[i] = 1; + } + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + valid_other_ = FindOtherMask(); + GALOIS_LOG_ASSERT(valid_other_ <= 109513177); + } else { + size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", + &global_training_mask_range_, + &local_training_mask_); + size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", + &global_validation_mask_range_, + &local_validation_mask_); + size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", + &global_testing_mask_range_, + &local_testing_mask_); + valid_other_ = FindOtherMask(); + // the "other" set of nodes that don't fall into any classification + if (galois::runtime::getSystemNetworkInterface().ID == 0) { + galois::gInfo("Valid # training nodes is ", valid_train); + galois::gInfo("Valid # validation nodes is ", valid_val); + galois::gInfo("Valid # test nodes is ", valid_test); + galois::gInfo("Valid # other nodes is ", valid_other_); + } + } + } + //! Initializes the norm factors using the entire graph's topology for global //! degree access - void InitNormFactor(); + void InitNormFactor() { + GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); + global_degrees_.resize(partitioned_graph_->size(), 0.0); + global_train_degrees_.resize(partitioned_graph_->size(), 0.0); + CalculateFullNormFactor(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_memory_.InitNormFactor(partitioned_graph_->size()); + } +#endif + } //! Used if ranges for a mask are complete (if in range, it's part of mask). 
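// InitNormFactor above only sizes the degree arrays and defers the actual math to
// CalculateFullNormFactor. For illustration only: a common choice for GCN-style
// aggregation is the symmetric factor 1 / sqrt(deg(u) * deg(v)). The helper below is
// an assumption about that style of normalization, not a statement of what
// CalculateFullNormFactor implements.
#include <cmath>
#include <cstdint>

// Hypothetical helper: scale a message on edge (u, v) by the symmetric norm factor.
inline float SymmetricNormFactor(uint32_t deg_u, uint32_t deg_v) {
  if (deg_u == 0 || deg_v == 0) {
    return 0.0f; // isolated endpoints contribute nothing
  }
  return 1.0f / std::sqrt(static_cast<float>(deg_u) * static_cast<float>(deg_v));
}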
- bool IsValidForPhaseCompleteRange(const unsigned lid, - const galois::GNNPhase current_phase) const; + bool + IsValidForPhaseCompleteRange(const unsigned lid, + const galois::GNNPhase current_phase) const { + // only use ranges if they're complete + // convert to gid first + size_t gid = partitioned_graph_->getGID(lid); + + // select range to use based on phase + const GNNRange* range_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + range_to_use = &global_training_mask_range_; + break; + case GNNPhase::kValidate: + range_to_use = &global_validation_mask_range_; + break; + case GNNPhase::kTest: + range_to_use = &global_testing_mask_range_; + break; + case GNNPhase::kOther: + GALOIS_LOG_FATAL("no range for other"); + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + range_to_use = nullptr; + } + + // if within range, it is valid + // there is an assumption here that ranges are contiguous; may not + // necessarily be the case in all inputs in which case using the mask is + // required (but less cache efficient) + if (range_to_use->begin <= gid && gid < range_to_use->end) { + return true; + } else { + return false; + } + } //! Used if ranges for a mask are incomplete, meaning I actually have to //! check the mask. bool IsValidForPhaseMasked(const unsigned lid, - const galois::GNNPhase current_phase) const; + const galois::GNNPhase current_phase) const { + // select mask to use based on phase + const GNNMask* mask_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + mask_to_use = &local_training_mask_; + break; + case GNNPhase::kValidate: + mask_to_use = &local_validation_mask_; + break; + case GNNPhase::kTest: + mask_to_use = &local_testing_mask_; + break; + case GNNPhase::kOther: + if (valid_other_ == 0) { + return false; + } + mask_to_use = &other_mask_; + break; + case GNNPhase::kBatch: + mask_to_use = &local_minibatch_mask_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + mask_to_use = nullptr; + } + return (*mask_to_use)[lid]; + } ////////////////////////////////////////////////////////////////////////////// // Accuracy ////////////////////////////////////////////////////////////////////////////// float GetGlobalAccuracyCPU(PointerWithSize predictions, - GNNPhase phase, bool sampling); + GNNPhase phase, bool sampling) { + galois::StatTimer global_accuracy_timer("GetGlobalAccuracy"); + galois::StatTimer global_accuracy_for_singleclass_timer( + "GetGlobalAccuracyForSingleClass"); + galois::StatTimer global_accuracy_for_multiclass_timer( + "GetGlobalAccuracyForMultiClass"); + global_accuracy_timer.start(); + float accuracy{0}; + if (is_single_class_label()) { + global_accuracy_for_singleclass_timer.start(); + accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + global_accuracy_for_singleclass_timer.stop(); + } else { + global_accuracy_for_multiclass_timer.start(); + accuracy = GetGlobalAccuracyCPUMulti(predictions, phase, sampling); + global_accuracy_for_multiclass_timer.stop(); + } + global_accuracy_timer.stop(); + return accuracy; + } + float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, - GNNPhase phase, bool sampling); + GNNPhase phase, bool) { + // check owned nodes' accuracy + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + // will only loop over sampled nodes if sampling is on + galois::iterate(begin_owned(), end_owned()), + // this is possibly the subgraph id + [&](const unsigned node_id) { + if (IsValidForPhase(node_id, phase)) { + total_checked_ += 1; + // get prediction by getting 
max + // note the use of node_id here: lid only used to check original + // labels + size_t predicted_label = + galois::MaxIndex(num_label_classes_, + &(predictions[node_id * num_label_classes_])); + // check against ground truth and track accordingly + // TODO static cast used here is dangerous + if (predicted_label == + static_cast(GetSingleClassLabel(node_id))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal()); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, + global_correct, global_checked); + + return static_cast(global_correct) / + static_cast(global_checked); + } + float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, - GNNPhase phase, bool sampling); + GNNPhase phase, bool sampling) { + const GNNLabel* full_ground_truth = GetMultiClassLabel(0); + assert(predictions.size() == (num_label_classes_ * size())); + + size_t global_true_positive = 0; + size_t global_true_negative = 0; + size_t global_false_positive = 0; + size_t global_false_negative = 0; + size_t global_f1_score = 0; + + // per class check + for (size_t label_class = 0; label_class < num_label_classes_; + label_class++) { + local_true_positive_.reset(); + local_true_negative_.reset(); + local_false_positive_.reset(); + local_false_negative_.reset(); + + // loop through all *owned* nodes (do not want to overcount) + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const unsigned lid) { + if (IsValidForPhase(lid, phase)) { + if (sampling) { + if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { + return; + } + } + + size_t label_index = lid * num_label_classes_ + label_class; + + GNNLabel true_label = full_ground_truth[label_index]; + GNNLabel prediction_is_positive = + (predictions[label_index] > 0.5) ? 1 : 0; + + if (true_label && prediction_is_positive) { + local_true_positive_ += 1; + } else if (true_label && !prediction_is_positive) { + local_false_negative_ += 1; + } else if (!true_label && prediction_is_positive) { + local_false_positive_ += 1; + } else if (!true_label && !prediction_is_positive) { + local_true_negative_ += 1; + } else { + // all cases should be covered with clauses above, so it should + // NEVER get here; adding it here just for sanity purposes + GALOIS_LOG_FATAL( + "Logic error with true label and prediction label"); + } + } + total_checked_ += 1; + }, + galois::steal(), galois::loopname("GlobalMultiAccuracy")); + + // reduce from accumulators across all hosts for this particular class + size_t class_true_positives = local_true_positive_.reduce(); + size_t class_false_positives = local_false_positive_.reduce(); + size_t class_true_negatives = local_true_negative_.reduce(); + size_t class_false_negatives = local_false_negative_.reduce(); + + // add to global counts + global_true_positive += class_true_positives; + global_false_positive += class_false_positives; + global_true_negative += class_true_negatives; + global_false_negative += class_false_negatives; + + // calculate precision, recall, and f1 score for this class + // ternery op used to avoid division by 0 + double class_precision = + (class_true_positives + class_true_negatives) > 0 + ? static_cast(class_true_positives) / + (class_true_positives + class_false_positives) + : 0.0; + double class_recall = + (class_true_positives + class_false_negatives) > 0 + ? 
static_cast(class_true_positives) / + (class_true_positives + class_false_negatives) + : 0.0; + double class_f1_score = (class_precision + class_recall) > 0 + ? (2.0 * (class_precision * class_recall)) / + (class_precision + class_recall) + : 0.0; + + global_f1_score += class_f1_score; + } // end label class loop + + // GALOIS_LOG_WARN("{} {} {} {}", global_true_positive, + // global_true_negative, global_false_positive, global_false_negative); + + // double global_f1_macro_score = global_f1_score / num_label_classes_; + + // micro = considers all classes for precision/recall + double global_micro_precision = + (global_true_positive + global_true_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_positive) + : 0.0; + double global_micro_recall = + (global_true_positive + global_false_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_negative) + : 0.0; + + double global_f1_micro_score = + (global_micro_precision + global_micro_recall) > 0 + ? (2.0 * (global_micro_precision * global_micro_recall)) / + (global_micro_precision + global_micro_recall) + : 0.0; + + return global_f1_micro_score; + } + + void increment_evilPhase() { + ++galois::runtime::evilPhase; + if (galois::runtime::evilPhase >= + static_cast(std::numeric_limits::max())) { + galois::runtime::evilPhase = 1; + } + } ////////////////////////////////////////////////////////////////////////////// // Vars @@ -838,7 +2591,58 @@ class GNNGraph { GNNGraphGPUAllocations gpu_memory_; //! Call this to setup GPU memory for this graph: allocates necessary GPU //! memory and copies things over - void InitGPUMemory(); + void InitGPUMemory() { + // create int casted CSR + uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); + uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); + + // + 1 because first element is 0 in BLAS CSRs + std::vector e_index(partitioned_graph_->size() + 1); + std::vector e_dest(partitioned_graph_->sizeEdges()); + + // set in parallel + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size() + 1), + [&](size_t index) { + if (index != 0) { + if (e_index_ptr[index - 1] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_index_ptr[index - 1]); + } + e_index[index] = static_cast(e_index_ptr[index - 1]); + } else { + e_index[index] = 0; + } + }, + galois::loopname("GPUEdgeIndexConstruction")); + galois::do_all( + galois::iterate(static_cast(0), + partitioned_graph_->sizeEdges()), + [&](size_t edge) { + if (e_dest_ptr[edge] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_dest_ptr[edge]); + } + + e_dest[edge] = static_cast(e_dest_ptr[edge]); + }, + galois::loopname("GPUEdgeDestConstruction")); + + gpu_memory_.SetGraphTopology(e_index, e_dest); + e_index.clear(); + e_dest.clear(); + + gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); + gpu_memory_.SetLabels(local_ground_truth_labels_); + gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, + local_testing_mask_); + gpu_memory_.AllocAggregateBitset(partitioned_graph_->size()); + gpu_memory_.SetGlobalTrainDegrees(global_train_degrees_); + gpu_memory_.SetGlobalDegrees(global_degrees_); + } + #endif //! 
Used to track accurate predictions during accuracy calculation DGAccumulator num_correct_; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index c7692533ba..9bddc9d313 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -1,10 +1,14 @@ +#include "galois/graphs/GNNGraph.h" + +#include + // Note no header guard or anything like that; this file is meant to be // included in the middle of GNNGraph class declaration as a class in a class class GNNSubgraph { public: - using GraphNode = LC_CSR_CSC_Graph::GraphNode; + using GraphNode = typename LC_CSR_CSC_Graph::GraphNode; using NodeIterator = boost::counting_iterator; - using EdgeIterator = LC_CSR_CSC_Graph::edge_iterator; + using EdgeIterator = typename LC_CSR_CSC_Graph::edge_iterator; //! Allocates space for the lid to sid map GNNSubgraph(size_t main_graph_size) { @@ -16,9 +20,35 @@ class GNNSubgraph { } //! Given sampled bits set on gnn_graph, builds an explicit subgraph //! for the sampled bits - size_t BuildSubgraph(GNNGraph& gnn_graph, size_t num_sampled_layers); + size_t BuildSubgraph(GNNGraph& gnn_graph, + size_t num_sampled_layers) { + galois::StatTimer timer("BuildSubgraph", kRegionName); + TimerStart(&timer); + for (auto& vec : subgraph_mirrors_) { + vec.clear(); + } + CreateSubgraphMapping(gnn_graph, num_sampled_layers); + if (num_subgraph_nodes_ == 0) { + return 0; + } + DegreeCounting(gnn_graph); + EdgeCreation(gnn_graph); + NodeFeatureCreation(gnn_graph); + // loop over each node, grab out/in edges, construct them in LC_CSR_CSC + // no edge data, just topology + TimerStop(&timer); + return num_subgraph_nodes_; + } - size_t BuildSubgraphView(GNNGraph& gnn_graph, size_t num_sampled_layers); + size_t BuildSubgraphView(GNNGraph& gnn_graph, + size_t num_sampled_layers) { + galois::StatTimer timer("BuildSubgraphView", kRegionName); + TimerStart(&timer); + CreateSubgraphMapping(gnn_graph, num_sampled_layers); + NodeFeatureCreation(gnn_graph); + TimerStop(&timer); + return num_subgraph_nodes_; + } galois::PODResizeableArray& GetLocalFeatures() { return subgraph_node_features_; @@ -52,7 +82,7 @@ class GNNSubgraph { return underlying_graph_.getEdgeDst(out_edge_iterator); }; galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> edges(GraphNode n) { return internal::make_no_deref_range(edge_begin(n), edge_end(n)); } @@ -67,7 +97,7 @@ class GNNSubgraph { return underlying_graph_.getInEdgeDst(in_edge_iterator); }; galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> in_edges(GraphNode n) { return internal::make_no_deref_range(in_edge_begin(n), in_edge_end(n)); } @@ -81,12 +111,12 @@ class GNNSubgraph { ////////////////////////////////////////////////////////////////////////////// bool OutEdgeSampled(EdgeIterator out_edge_iterator, size_t layer_num, - const GNNGraph& original_graph) { + const GNNGraph& original_graph) { return original_graph.IsEdgeSampledOriginalGraph( subedge_to_original_edge_[*out_edge_iterator], layer_num); } bool InEdgeSampled(EdgeIterator in_edge_iterator, size_t layer_num, - const GNNGraph& original_graph) { + const GNNGraph& original_graph) { // note that original IsEdgeSampled is called because this object stores the // original edge already return original_graph.IsEdgeSampledOriginalGraph( @@ -119,7 +149,247 @@ class GNNSubgraph { // TODO signature cleanup //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. 
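// Before reading the parallel, layer-aware implementation below, the core of the
// mapping can be seen in a minimal serial sketch: compact the locally active nodes
// into a dense subgraph id space and keep both directions of the mapping. Names and
// signature here are illustrative only; the real code additionally orders ids by
// sample timestamp and tracks mirrors per host.
#include <cstdint>
#include <limits>
#include <vector>

void BuildIdMaps(const std::vector<bool>& active,   // one flag per local node
                 std::vector<uint32_t>* lid_to_sid, // local id -> subgraph id
                 std::vector<uint32_t>* sid_to_lid) // subgraph id -> local id
{
  const uint32_t kInvalid = std::numeric_limits<uint32_t>::max();
  lid_to_sid->assign(active.size(), kInvalid);
  sid_to_lid->clear();
  for (size_t lid = 0; lid < active.size(); ++lid) {
    if (active[lid]) {
      (*lid_to_sid)[lid] = static_cast<uint32_t>(sid_to_lid->size());
      sid_to_lid->push_back(static_cast<uint32_t>(lid));
    }
  }
}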
Should be done every epoch when sampled graph changes. - void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t); + void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t) { + galois::StatTimer timer("SIDMapping", kRegionName); + TimerStart(&timer); + + assert(gnn_graph.size() == lid_to_subgraph_id_.size()); + // clear all mappings + galois::ParallelSTL::fill(lid_to_subgraph_id_.begin(), + lid_to_subgraph_id_.end(), + std::numeric_limits::max()); + + galois::GAccumulator subgraph_count; + subgraph_count.reset(); + galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsActiveInSubgraph(node_id)) { + subgraph_count += 1; + } + }); + num_subgraph_nodes_ = subgraph_count.reduce(); + // if no subgraph, get out + if (num_subgraph_nodes_ == 0) { + subgraph_master_boundary_ = 0; + TimerStop(&timer); + return; + } + + // checking sanity + // galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + // [&](uint32_t node_id) { + // if (gnn_graph.IsInSampledGraph(node_id) && + // !gnn_graph.IsActiveInSubgraph(node_id)) { + // // check if any edges are active + // for (auto a = gnn_graph.edge_begin(node_id); a != + // gnn_graph.edge_end(node_id);a++) { + // if (gnn_graph.IsEdgeSampledAny(a)) { + // galois::gWarn("ERROR node ", node_id); + // } + // } + // for (auto a = gnn_graph.in_edge_begin(node_id); a != + // gnn_graph.in_edge_end(node_id);a++) { + // if (gnn_graph.IsInEdgeSampledAny(a)) { + // galois::gWarn("ERROR in node ", node_id); + // } + // } + // } + // }); + + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { + // allocate a bit more than necessary to avoid a big realloc + // if node value changes slightly later + subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); + } + + // bitset to mark if a master is outside the "master only" boundary + // and not contiguous; needed to mask out non-masters + galois::DynamicBitSet& non_layer_zero_masters = + gnn_graph.GetNonLayerZeroMasters(); + // init the bitset as necessary + if (non_layer_zero_masters.size() < num_subgraph_nodes_) { + non_layer_zero_masters.resize(num_subgraph_nodes_); + } else { + non_layer_zero_masters.ParallelReset(); + } + + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); + + ResetSIDThreadOffsets(master_offsets.size()); + + // compute offsets for each layer + galois::PODResizeableArray layer_offsets; + layer_offsets.resize(master_offsets.size() - 1); + for (unsigned i = 0; i < layer_offsets.size(); i++) { + layer_offsets[i] = master_offsets[i] + mirror_offsets[i]; + if (i > 0) { + // prefix summing + layer_offsets[i] += layer_offsets[i - 1]; + } + } + + // all nodes before this SID are master nodes in layer 0; + // NOTE: there are master nodes past this boundary that will + // not be covered by a begin_owned loop, which may cause problems down + // the line; this is handled by the bitset above + subgraph_master_boundary_ = master_offsets[0]; + + size_t last_owned_node = *(gnn_graph.end_owned()); + // compute amount of work each thread needs to do + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; + unsigned end_node; + // this thread always has a set number of nodes to run; this is it + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + // these arrays track how much work will need to be done by this + // thread + galois::PODResizeableArray& my_offsets = + 
sid_thread_offsets_[thread_id]; + galois::PODResizeableArray& my_mirror_offsets = + subgraph_mirror_offsets_[thread_id]; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + // only bother if node was active + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = + gnn_graph.SampleNodeTimestamp(local_node_id); + // TODO(loc) this check shouldn't even be necessary; active in + // subgraph implies added at somepoint + if (node_timestamp != std::numeric_limits::max()) { + // tracks how many nodes for each timestamp this node will + // work with by incrementing this + my_offsets[node_timestamp]++; + + if (local_node_id >= last_owned_node) { + // this is a mirror node; get the host that the master is located + // on and increment this thread's mirror node count for that host + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + } + } else { + GALOIS_LOG_WARN("shouldn't ever get here right?"); + } + } + } + }); + + // prefix sum the threads + galois::do_all(galois::iterate(size_t{0}, master_offsets.size()), + [&](size_t layer_num) { + for (size_t thread_id = 1; + thread_id < galois::getActiveThreads(); thread_id++) { + sid_thread_offsets_[thread_id][layer_num] += + sid_thread_offsets_[thread_id - 1][layer_num]; + } + }); + + for (unsigned i = 0; i < master_offsets.size() - 1; i++) { + if (i > 0) { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] + + layer_offsets[i - 1] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } else { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } + } + + // last element of prefix sum needs to equal the correct layer offset + galois::do_all( + galois::iterate(uint32_t{0}, + galois::runtime::getSystemNetworkInterface().Num), + [&](size_t host_num) { + // for each host, get prefix sum of each thread's mirrors + for (size_t thread_id = 1; thread_id < galois::getActiveThreads(); + thread_id++) { + subgraph_mirror_offsets_[thread_id][host_num] += + subgraph_mirror_offsets_[thread_id - 1][host_num]; + } + }); + + // allocate the mirror space; last element of prefix sum is total size + for (unsigned host_num = 0; + host_num < galois::runtime::getSystemNetworkInterface().Num; + host_num++) { + if (galois::runtime::getSystemNetworkInterface().ID == host_num) { + continue; + } + subgraph_mirrors_[host_num].resize( + subgraph_mirror_offsets_[galois::getActiveThreads() - 1][host_num]); + } + + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; + unsigned end_node; + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + + galois::PODResizeableArray& current_thread_offset = + thread_id != 0 ? sid_thread_offsets_[thread_id - 1] + : thread_zero_work_; + galois::PODResizeableArray& my_mirror_offsets = + thread_id != 0 ? 
subgraph_mirror_offsets_[thread_id - 1] + : thread_zero_mirror_offsets_; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = + gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1] + + current_thread_offset[node_timestamp]++; + if (local_node_id < last_owned_node) { + // master node that is not in layer 0 (i.e. node_timestamp != 0) + non_layer_zero_masters.set(sid_to_use); + } + } else { + // node timestamp == 0; no layer offset needed because offset + // is 0 + sid_to_use = current_thread_offset[node_timestamp]++; + } + + // this is a mirror + if (local_node_id >= last_owned_node) { + // XXX(loc) mirror offsets + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + size_t my_offset = + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + + if (my_offset > + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()) + GALOIS_LOG_FATAL( + "{} {}", my_offset, + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()); + + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)][my_offset] = + node_gid; + } + + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use; + } else { + GALOIS_LOG_WARN("shouldn't ever get here right?"); + } + } + } + }); + + TimerStop(&timer); + } //! reset sid thread offsets used for parallel SID mapping creation void ResetSIDThreadOffsets(size_t num_layers) { @@ -162,11 +432,173 @@ class GNNSubgraph { } //! Counts in and out degrees of all sampled nodes in the graph - void DegreeCounting(const GNNGraph& gnn_graph); + void DegreeCounting(const GNNGraph& gnn_graph) { + galois::StatTimer timer("DegreeCounting", kRegionName); + TimerStart(&timer); + + if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02); + } + + if (local_subgraph_in_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_in_degrees_.resize(num_subgraph_nodes_ * 1.02); + } + + galois::do_all( + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + uint32_t out_degrees = 0; + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + out_degrees++; + } + } + local_subgraph_out_degrees_[subgraph_id] = out_degrees; + + uint32_t in_degrees = 0; + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_degrees++; + } + } + local_subgraph_in_degrees_[subgraph_id] = in_degrees; + }, + galois::loopname("DegreeCountingDoAll"), galois::steal()); + + TimerStop(&timer); + } + //! 
Creates edges - void EdgeCreation(const GNNGraph& gnn_graph); + void EdgeCreation(const GNNGraph& gnn_graph) { + galois::StatTimer timer("EdgeConstruction", kRegionName); + TimerStart(&timer); + // galois::DGAccumulator empty_masters; + // galois::DGAccumulator empty_mirrors; + // empty_masters.reset(); + // empty_mirrors.reset(); + + // galois::DGAccumulator total_sn; + // total_sn.reset(); + // total_sn += num_subgraph_nodes_; + // size_t global_sub_size = total_sn.reduce(); + + // prefix sum over subgraph degrees from previous phase to get starting + // points + for (size_t i = 1; i < num_subgraph_nodes_; i++) { + // if (local_subgraph_out_degrees_[i] == 0 && + // local_subgraph_in_degrees_[i] == 0) { + // if (i < subgraph_master_boundary_) { + // empty_masters += 1; + // } else { + // if (gnn_graph.GetNonLayerZeroMasters().test(i)) { + // empty_masters += 1; + // } else { + // empty_mirrors += 1; + // } + // } + //} + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; + } + + // uint32_t emaster = empty_masters.reduce(); + // uint32_t emirror = empty_mirrors.reduce(); + // if (gnn_graph.host_id() == 0) { + // galois::gInfo("Empty masters percent is ", emaster / + // (float)global_sub_size, + // " ", emaster, " ", global_sub_size); + // galois::gInfo("Empty mirrors percent is ", emirror / + // (float)global_sub_size, + // " ", emirror, " ", global_sub_size); + //} + + // allocate then set node endpoints + num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; + + galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName); + TimerStart(&alloc_time); + underlying_graph_.DeallocateOnly(); + underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); + underlying_graph_.CSCAllocate(); + TimerStop(&alloc_time); + + galois::gInfo(gnn_graph.host_prefix(), "Subgraph nodes and edges are ", + num_subgraph_nodes_, " ", num_subgraph_edges_); + + galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), + [&](uint32_t subgraph_id) { + underlying_graph_.fixEndEdge( + subgraph_id, local_subgraph_out_degrees_[subgraph_id]); + underlying_graph_.FixEndInEdge( + subgraph_id, local_subgraph_in_degrees_[subgraph_id]); + }); + if (subedge_to_original_edge_.size() < num_subgraph_edges_) { + subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } + if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { + in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } + + // save edges + save reference to layer sample status + galois::do_all( + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + assert(subgraph_id != std::numeric_limits::max()); + uint32_t out_location = 0; + uint32_t in_location = 0; + if (subgraph_id != 0) { + out_location = local_subgraph_out_degrees_[subgraph_id - 1]; + in_location = local_subgraph_in_degrees_[subgraph_id - 1]; + } + + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + assert( + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != + std::numeric_limits::max()); + subedge_to_original_edge_[out_location] = *out_edge_iter; + + underlying_graph_.constructEdge( + out_location++, + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); + } + } + + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + 
in_subedge_to_original_edge_[in_location] = + *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); + underlying_graph_.ConstructInEdge( + in_location++, + lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); + } + } + assert(out_location == local_subgraph_out_degrees_[subgraph_id]); + assert(in_location == local_subgraph_in_degrees_[subgraph_id]); + }, + galois::loopname("EdgeCreationDoAll"), galois::steal()); + TimerStop(&timer); + } + //! Copies over relevant features of the nodes - void NodeFeatureCreation(GNNGraph& gnn_graph); + void NodeFeatureCreation(GNNGraph& gnn_graph) { + galois::StatTimer timer("NodeFeatureCreation", kRegionName); + TimerStart(&timer); + size_t feat_length = gnn_graph.node_feature_length(); + subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); + + galois::do_all( + galois::iterate(begin(), end()), [&](size_t subgraph_node_id) { + size_t local_id = subgraph_id_to_lid_[subgraph_node_id]; + std::memcpy( + &(subgraph_node_features_[subgraph_node_id * feat_length]), + &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), + feat_length * sizeof(GNNFeature)); + }); + TimerStop(&timer); + } static const constexpr char* kRegionName = "GNNSubgraph"; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 50a07bdd4e..422965fbaf 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -8,6 +8,7 @@ namespace galois { namespace graphs { +extern std::vector* sampled_nodes_; extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; @@ -20,16 +21,22 @@ extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; #endif +// NodeTy is always a node data type of a "graph" type. +// This type is used by GluonSubstrate to reset a value. +// ValTy is either a node data type of a graph or the ones +// that are stored in separate objects. +template struct SampleFlagSync { - using ValTy = char; + using NodeTy = NTy; + using ValTy = char; //! return a vector of floats to sync - static ValTy extract(uint32_t, char& i) { return i; } + static ValTy extract(uint32_t lid, NodeTy&) { return (*sampled_nodes_)[lid]; } - static bool reduce(uint32_t, char& i, ValTy y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy y) { if (y) { - i = y; - assert(i == 1); + (*sampled_nodes_)[lid] = y; + assert((*sampled_nodes_)[lid] == 1); return true; } else { return false; @@ -37,10 +44,12 @@ struct SampleFlagSync { } //! No-op: readAny = overwritten anyways - static void reset(uint32_t, char&) {} + static void reset(uint32_t, NodeTy&) {} //! element wise set - static void setVal(uint32_t, char& i, ValTy y) { i = y; } + static void setVal(uint32_t lid, NodeTy&, ValTy y) { + (*sampled_nodes_)[lid] = y; + } // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { @@ -67,13 +76,15 @@ struct SampleFlagBitset { } }; +template struct GNNSumAggregate { - using ValTy = galois::gstl::Vector; + using ValTy = galois::gstl::Vector; + using NodeTy = NTy; static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync - static ValTy extract(uint32_t node_id, char&) { + static ValTy extract(uint32_t node_id, NodeTy&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. 
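// The aggregation sync structures in this header all follow the same Gluon
// field-synchronizer contract over a node-major float matrix: extract() reads the
// row for a node, reduce() element-wise adds a received row into it, and setVal()
// overwrites it. A compact sketch of that row layout, with hypothetical free
// functions standing in for the struct members:
#include <cstddef>
#include <vector>

// Row-major matrix: row `node` starts at offset node * columns.
std::vector<float> ExtractRow(const float* matrix, size_t columns, size_t node) {
  return std::vector<float>(matrix + node * columns, matrix + (node + 1) * columns);
}

void ReduceRow(float* matrix, size_t columns, size_t node, const float* incoming) {
  for (size_t i = 0; i < columns; ++i) {
    matrix[node * columns + i] += incoming[i]; // reduction is element-wise add
  }
}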
// assert(device_personality == DevicePersonality::CPU); @@ -100,7 +111,7 @@ struct GNNSumAggregate { //! reduction is addition in this case; add received vector to //! own vector - static bool reduce(uint32_t node_id, char&, ValTy y) { + static bool reduce(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -111,7 +122,7 @@ struct GNNSumAggregate { return true; } - static bool reduce(uint32_t node_id, char&, const ValTy::value_type* y) { + static bool reduce(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { // XXX vectorized add @@ -122,7 +133,7 @@ struct GNNSumAggregate { } //! No-op: readAny = overwritten anyways - static void reset(uint32_t, char&) {} + static void reset(uint32_t, NodeTy&) {} // Reset is here in case anyone wants to bring it back // static void reset(uint32_t node_id, char&) { // for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -132,7 +143,7 @@ struct GNNSumAggregate { //} //! element wise set - static void setVal(uint32_t node_id, char&, ValTy y) { + static void setVal(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -141,7 +152,7 @@ struct GNNSumAggregate { } } - static void setVal(uint32_t node_id, char&, const ValTy::value_type* y) { + static void setVal(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = @@ -165,13 +176,15 @@ struct GNNSumAggregate { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +template struct GNNSampleSumAggregate { - using ValTy = galois::gstl::Vector; + using ValTy = galois::gstl::Vector; + using NodeTy = NTy; static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync - static ValTy extract(uint32_t node_id, char&) { + static ValTy extract(uint32_t node_id, NodeTy&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. // assert(device_personality == DevicePersonality::CPU); @@ -212,7 +225,7 @@ struct GNNSampleSumAggregate { //! reduction is addition in this case; add received vector to //! own vector - static bool reduce(uint32_t node_id, char&, ValTy y) { + static bool reduce(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { @@ -231,7 +244,7 @@ struct GNNSampleSumAggregate { return true; } - static bool reduce(uint32_t node_id, char&, ValTy::value_type* y) { + static bool reduce(uint32_t node_id, NodeTy&, ValTy::value_type* y) { if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { return false; @@ -252,10 +265,10 @@ struct GNNSampleSumAggregate { } //! 
No-op: readAny = overwritten anyways - static void reset(uint32_t, char&) {} + static void reset(uint32_t, NodeTy&) {} // version where you have a vector object - static void setVal(uint32_t node_id, char&, ValTy y) { + static void setVal(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; if (converted_sid >= num_active_layer_rows_ || @@ -273,7 +286,7 @@ struct GNNSampleSumAggregate { // version where you have a pointer only (more efficient because this // version is for reading directly from the recv buffer) - static void setVal(uint32_t node_id, char&, ValTy::value_type* y) { + static void setVal(uint32_t node_id, NodeTy&, ValTy::value_type* y) { uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; if (converted_sid >= num_active_layer_rows_ || converted_sid == std::numeric_limits::max()) { @@ -303,6 +316,112 @@ struct GNNSampleSumAggregate { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +template +struct SHADGNNSumAggregate { + using ValTy = galois::gstl::Vector; + using NodeTy = NTy; + + static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_ / 2; } + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, NodeTy&) { + // It should be a CPU synchronizing substrate. + // If the GPU flag is turned off, then personality does not exist. + // assert(device_personality == DevicePersonality::CPU); + + // It should extract the last half of features of the adjacent neighbors + // (So, source of feature aggregation). + ValTy extracted_vec; + extracted_vec.reserve(gnn_matrix_to_sync_column_length_ / 2); + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + // XXX memcpy + extracted_vec.emplace_back( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2]); + } + // move constructor should kick in here to avoid return copy + return extracted_vec; + } + + //! return a vector of floats to sync + static void ExtractDirect(uint32_t node_id, + typename ValTy::value_type* to_write) { + std::memcpy( + to_write, + (char*)&( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + + gnn_matrix_to_sync_column_length_ / 2]), + (gnn_matrix_to_sync_column_length_ / 2) * + sizeof(typename ValTy::value_type)); + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_ / 2); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + // XXX vectorized add + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] += y[i]; + } + return true; + } + + static bool reduce(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + // XXX vectorized add + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] += y[i]; + } + return true; + } + + //! 
No-op: readAny = overwritten anyways + static void reset(uint32_t, NodeTy&) {} + // Reset is here in case anyone wants to bring it back + // static void reset(uint32_t node_id, char&) { + // for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + // 0; + // } + //} + + //! element wise set + static void setVal(uint32_t node_id, NodeTy&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] = y[i]; + } + } + + static void setVal(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx; GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index e7dc46e9f3..c347ae8dbe 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -1,21 +1,43 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/Logging.h" +#include "galois/GNNMath.h" namespace galois { //! Just does a linear xform with no convolution over graph -class DenseLayer : public GNNLayer { +template +class DenseLayer : public GNNLayer { public: //! Initializes the variables of the base class and also allocates additional //! memory for temporary matrices. Also initializes sync substrate for the //! 
weight matrix - DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config); + const GNNLayerDimensions& layer_dimensions, + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, + layer_dimensions, config), + input_column_intermediates_(layer_dimensions.input_columns), + output_column_intermediates_(layer_dimensions.output_columns) { + // TODO Need to make sure that layer knows about forward/backward matrix + // sharing (e.g., overwriting previously used input to save space) + GALOIS_LOG_FATAL( + "This layer has not been kept up to date; do not use until " + "sure it's been updated"); + size_t num_input_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + size_t num_output_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + this->layer_type_ = galois::GNNLayerType::kDense; + this->p_in_temp_1_ = PointerWithSize(in_temp_1_); + GALOIS_LOG_VERBOSE("Dense initialized"); + } - DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : DenseLayer(layer_num, graph, backward_output_matrix, dimensions, @@ -23,11 +45,80 @@ class DenseLayer : public GNNLayer { // Parent functions const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == (this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns)); + assert(this->p_in_temp_1_.size() == input_embeddings.size()); + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + this->DoDropout(input_embeddings, &this->p_in_temp_1_); + input_data = this->p_in_temp_1_.data(); + } + + // FW + UpdateEmbeddings(input_data, this->p_forward_output_matrix_.data()); + + if (!this->config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + this->Activation(); + } + + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + return this->p_forward_output_matrix_; + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) final; + PointerWithSize* input_gradient) final { + assert(this->layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!this->config_.disable_activation) { + this->ActivationDerivative(input_gradient); + } + + if (this->layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(input_gradient->data(), + this->p_backward_output_matrix_.data()); + } + + galois::PointerWithSize input_data; + if (!this->config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = 
this->p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + } + + // W' = F^T (FW)' + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + // sync weight gradients; note aggregation sync occurs in the function call + // already + this->WeightGradientSyncSum(); + + if (!this->config_.disable_dropout && this->layer_number_ != 0) { + this->DoDropoutDerivative(); + } + + return this->p_backward_output_matrix_; + } private: // 2 temporaries the size of the forward input; used for dropout and @@ -45,9 +136,54 @@ class DenseLayer : public GNNLayer { output_column_intermediates_; //! Do embedding update via mxm with this layer's weights (forward) - void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + */ + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + assert(this->p_layer_weights_.size() == + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + */ + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } #ifdef GALOIS_ENABLE_GPU // TODO(hochan/loc) replace with dense gpu object diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 786a973230..9ac6b925ae 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -3,6 +3,8 @@ #include "galois/PerThreadRNG.h" #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" +#include "galois/Logging.h" +#include "galois/layers/GradientSyncStructures.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/GNNLayer.cuh" @@ -81,17 +83,113 @@ struct GNNLayerConfig { // Tried to avoid inheritance, but keeping track of heterogeneous layers // becomes a mess if there isn't a base class I can create the container on. //! 
Base class for layers in a graph neural network +template class GNNLayer { public: //! Creation of a layer needs the # of the layer, the graph to train on, and //! the input/output dimensions of the MxM that occurs in the layer; config //! as well - GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), + config_(config) { + // TODO(loc) + // this is currently a backward-compatibility hack, need to have caller + // set output rows rather than created here + layer_dimensions_.output_rows = layer_dimensions_.input_rows; + + if (config_.allocate_weights) { + // dropout allocation; dropout is same as input + if (!config_.disable_dropout) { + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns, + false); + } + // allocate memory based on layer dimensions + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer weights ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); + layer_weights_.resize(num_weight_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer gradients ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); + layer_weight_gradients_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + } +#endif + + GlorotBengioInit(&layer_weights_); + } + + // TODO(loc) optimize this and layer creation in general + // this does not use output_rows and assumes the worst case where + // all nodes are generated + // for now it's kept as input_rows so as to not break things + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + + if (!config_.disable_output) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", forward output matrix ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + forward_output_matrix_.resize(num_output_elements, 0); + } + + if (layer_number_ != 0) { + GALOIS_LOG_VASSERT( + backward_output_matrix->size() == + layer_dimensions_.input_rows * layer_dimensions_.input_columns, + "backward output size {} should equal input size {}", + backward_output_matrix->size(), + layer_dimensions_.input_rows * layer_dimensions_.input_columns); + } else { + GALOIS_LOG_VASSERT(backward_output_matrix->data() == nullptr, + "layer 0 should null ptr backward output"); + GALOIS_LOG_VASSERT(backward_output_matrix->size() == 0, + "layer 0 should size 0 backward output"); + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize( + base_gpu_object_.layer_weights(), layer_weights_.size()); + p_layer_weight_gradients_ = + 
PointerWithSize(base_gpu_object_.layer_weight_gradients(), + layer_weight_gradients_.size()); + p_forward_output_matrix_ = PointerWithSize( + base_gpu_object_.forward_output(), forward_output_matrix_.size()); + p_backward_output_matrix_ = PointerWithSize( + base_gpu_object_.backward_output(), backward_output_matrix->size()); + // TODO can clear the cpu side vectors/don't use .size() since optimally + // they aren't initialized + } else { +#endif + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize(layer_weights_); + p_layer_weight_gradients_ = + PointerWithSize(layer_weight_gradients_); + p_forward_output_matrix_ = + PointerWithSize(forward_output_matrix_); + p_backward_output_matrix_ = *backward_output_matrix; +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! Uses a default config - GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, @@ -109,7 +207,31 @@ class GNNLayer { ResizeOutputMatrix(output_row); } - void ResizeOutputMatrix(size_t new_output_row); + void ResizeOutputMatrix(size_t new_output_row) { + size_t num_output_elements = + new_output_row * layer_dimensions_.output_columns; + + if (!config_.disable_output && + (forward_output_matrix_.size() < num_output_elements)) { + galois::gInfo(graph_.host_prefix(), "Resizing layer ", layer_number_, + ", forward output matrix to ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + // resize with a bit of a buffer to prevent possible future resizes + size_t buffer_size = (num_output_elements * 0.02); + forward_output_matrix_.resize(num_output_elements + buffer_size, 0); + } + + // XXX(hochan) GPU end +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif + // reinitialize the PointerWithSize wrappers + p_forward_output_matrix_ = + PointerWithSize(forward_output_matrix_); +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif + } void UpdateBackwardOutput(PointerWithSize* backward_output_matrix) { // XXX(hochan) gpu @@ -257,7 +379,7 @@ class GNNLayer { //! Pointer to the graph being trained by this layer. //! This is owned by the creator of this layer, so no need to free it when //! this layer is destroyed. - const galois::graphs::GNNGraph& graph_; + const galois::graphs::GNNGraph& graph_; //! Dimensions (input/output sizes) of this layer GNNLayerDimensions layer_dimensions_; //! Config object for certain parameters for layer @@ -318,38 +440,277 @@ class GNNLayer { //! used are the dimensions of this particular weight matrix //! TODO revisit paper and see what they really mean //! Code inspired DGL and TinyDNN - void GlorotBengioInit(std::vector* vector_to_init); + void GlorotBengioInit(std::vector* vector_to_init) { + float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + + layer_dimensions_.input_columns); + std::default_random_engine rng(1 + layer_number_); + std::uniform_real_distribution dist(-max, max); + + for (size_t i = 0; i < vector_to_init->size(); i++) { + (*vector_to_init)[i] = dist(rng); + } +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } +#endif + } //! 
Init 2 things as one unit; used for SAGE void PairGlorotBengioInit(std::vector* vector1, - std::vector* vector2); + std::vector* vector2) { + // multiplied by 2 here because 2 pieces are 1 unit + float max = + std::sqrt(6.0) / std::sqrt((2 * layer_dimensions_.output_columns) + + layer_dimensions_.input_columns); + assert(vector1->size() == (layer_dimensions_.input_columns * + layer_dimensions_.output_columns)); + assert(vector2->size() == (layer_dimensions_.input_columns * + layer_dimensions_.output_columns)); + std::default_random_engine rng(1 + layer_number_); + std::uniform_real_distribution dist(-max, max); + + for (size_t i = 0; i < vector1->size(); i++) { + (*vector1)[i] = dist(rng); + } + for (size_t i = 0; i < vector2->size(); i++) { + (*vector2)[i] = dist(rng); + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } +#endif + } //! Randomly init a float vector using the class's random init RNG - void RandomInitVector(std::vector* vector_to_init); + void RandomInitVector(std::vector* vector_to_init) { + galois::do_all( + galois::iterate(static_cast(0), vector_to_init->size()), + [&](size_t i) { + // pull from the class's per thread RNG + (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); + }, + galois::loopname("RandomInitVector")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } +#endif + } //! CPU variant of dropout - void DoDropoutCPU(const PointerWithSize input_to_drop, - PointerWithSize* output_matrix); + void DoDropoutCPU(const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + // TODO This (and dropout in general) may not work in the sampling setting + size_t num_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; + + // determine which parts to drop + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); + }, + galois::loopname("LayerDropoutRNG")); + + // create new matrix with non-dropped input + some scaling + // TODO save scaling elsewhere? + GNNFloat scale = 1. / (1. - config_.dropout_rate); + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("LayerDropout")); + } //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate - void DoDropout(const PointerWithSize input_to_drop, - PointerWithSize* output_matrix); + void DoDropout(const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + galois::StatTimer timer("ForwardDropout", "GNNLayer"); + TimerStart(&timer); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, + config_.dropout_rate); + } else { +#endif + DoDropoutCPU(input_to_dropout, output_matrix); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } + //! 
Apply the derivative of dropout to the backward phase output - void DoDropoutDerivative(); - void ReconstructDropoutMatrix(const PointerWithSize input_to_drop, - PointerWithSize* output_matrix); + void DoDropoutDerivative() { + galois::StatTimer timer("BackwardDropout", "GNNLayer"); + TimerStart(&timer); + assert(p_backward_output_matrix_.size() == dropout_mask_.size()); + GNNFloat scale = 1. / (1. - config_.dropout_rate); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), + scale); + } else { +#endif + // use dropout mask to figure out derivative + galois::do_all( + galois::iterate(static_cast(0), + p_backward_output_matrix_.size()), + [&](size_t i) { + p_backward_output_matrix_[i] = + p_backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("LayerDropoutDerivative")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } + + void + ReconstructDropoutMatrix(const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); + TimerStart(&timer); + // reuse the dropout mask from a previous dropout call + size_t num_elements = output_matrix->size(); + GNNFloat scale = 1. / (1. - config_.dropout_rate); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ReconstructDropoutMatrixGPU( + input_to_dropout, output_matrix, num_elements, scale); + } else { +#endif + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * + scale; + }, + galois::loopname("ReconstructDropout")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } //! Does some activation function based on configuration on forward output //! matrix - void Activation(); + void Activation() { + galois::StatTimer timer("ForwardActivation", "GNNLayer"); + TimerStart(&timer); + + // TODO only does relu at the moment; should check user specified activation + // and act accordingly +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); + } else { +#endif + if (activation_memo_.size() != p_forward_output_matrix_.size()) { + activation_memo_.resize(p_forward_output_matrix_.size()); + } + activation_memo_.reset(); + assert(activation_memo_.size() == p_forward_output_matrix_.size()); + assert(layer_dimensions_.output_rows * layer_dimensions_.output_columns <= + p_forward_output_matrix_.size()); + + galois::do_all(galois::iterate(static_cast(0), + layer_dimensions_.output_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + if (p_forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + p_forward_output_matrix_[i] = 0; + } + }); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } + void ActivationCPU(); //! 
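Editor's note: Activation() above applies ReLU in place and records which entries were positive in activation_memo_, so ActivationDerivative can later zero the gradients of the clamped entries. A stripped-down sketch of that forward/backward pairing (std::vector<bool> stands in for the dynamic bitset):

#include <vector>

// Forward ReLU: clamp negatives to zero, remember which entries stayed positive.
void ReluForward(std::vector<float>* x, std::vector<bool>* memo) {
  memo->assign(x->size(), false);
  for (size_t i = 0; i < x->size(); ++i) {
    if ((*x)[i] > 0.0f) {
      (*memo)[i] = true;   // value kept as-is
    } else {
      (*x)[i] = 0.0f;      // clamped
    }
  }
}

// Backward ReLU: gradients flow only where the forward output was positive.
void ReluBackward(const std::vector<bool>& memo, std::vector<float>* grad) {
  for (size_t i = 0; i < grad->size(); ++i) {
    if (!memo[i]) {
      (*grad)[i] = 0.0f;
    }
  }
}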
Calculate derivative of activation function based on config on the matrix - void ActivationDerivative(PointerWithSize* matrix); + void ActivationDerivative(PointerWithSize* gradient) { + galois::StatTimer timer("BackwardActivation", "GNNLayer"); + TimerStart(&timer); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationDerivativeGPU(gradient->data(), + gradient->size()); + } else { +#endif + assert(gradient->size() >= + layer_dimensions_.output_rows * layer_dimensions_.output_columns); + // TODO only does relu at the moment; should check user specified + // activation and act accordingly keep gradient if the original output was + // greater than 0 + galois::do_all( + galois::iterate(static_cast(0), + layer_dimensions_.output_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + // it was <= 0 before; set back to 0 + if (!activation_memo_.test(i)) { + (*gradient)[i] = 0; + } + }, + galois::loopname("ReLU-Derivative")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } //! Synchronize weight gradients with a summation - void WeightGradientSyncSum(); + void WeightGradientSyncSum() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + TimerStart(&clubbed_timer); + galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); + TimerStart(&t); + int weight_size = static_cast(p_layer_weight_gradients_.size()); + + // TODO(loc) remove this limitation later; can just do a loop over the + // weight matrix + if (p_layer_weight_gradients_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); + } +#ifdef GALOIS_ENABLE_GPU + // TODO(lhc) make this clang option later + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); + MPI_Allreduce(MPI_IN_PLACE, layer_weight_gradients_.data(), weight_size, + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + base_gpu_object_.CopyToWeightGradients(layer_weight_gradients_); + } else { +#endif + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&t); + TimerStop(&clubbed_timer); + } #ifdef GALOIS_ENABLE_GPU //! 
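Editor's note: WeightGradientSyncSum above sums the weight-gradient matrix across hosts with an in-place MPI_Allreduce, so every host holds the same global gradient before the optimizer step. A minimal standalone equivalent, assuming MPI has already been initialized elsewhere:

#include <mpi.h>
#include <vector>

// Sum `grads` element-wise across all ranks; every rank receives the result.
void AllreduceGradients(std::vector<float>* grads) {
  MPI_Allreduce(MPI_IN_PLACE, grads->data(), static_cast<int>(grads->size()),
                MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
}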
Object that holds all GPU allocated pointers to memory related to layers @@ -363,18 +724,176 @@ class GNNLayer { void MaskInputNonMasters(PointerWithSize* input) { MaskInputNonMasters(input, std::numeric_limits::max()); } - void MaskInputNonMasters(PointerWithSize* input, size_t max_rows); + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows, - const galois::DynamicBitSet&); + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + if (!bs.test(non_master)) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! 
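Editor's note: MaskInputNonMasters above zeroes the feature rows of mirror (non-owned) vertices, i.e. rows in [end_owned, active_size) clamped to max_rows, presumably so that each vertex contributes to the weight gradient only on its owning host. A simplified sketch of the row masking itself (helper name is hypothetical):

#include <algorithm>
#include <cstddef>
#include <vector>

// Zero rows [first_mirror_row, num_rows) of a row-major matrix with
// `row_width` columns; mirrors are assumed to be stored after the masters.
void ZeroMirrorRows(std::vector<float>* matrix, std::size_t row_width,
                    std::size_t first_mirror_row, std::size_t num_rows) {
  for (std::size_t row = first_mirror_row; row < num_rows; ++row) {
    auto begin = matrix->begin() + row * row_width;
    std::fill(begin, begin + row_width, 0.0f);
  }
}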
Mask a gradient size'd matrix's rows that correspond to mirrors void MaskGradientNonMasters(PointerWithSize* input) { MaskGradientNonMasters(input, std::numeric_limits::max()); } void MaskGradientNonMasters(PointerWithSize* gradients, - size_t max_rows); + size_t max_rows) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradients->size()); + assert(end_node * row_index <= gradients->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradients, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradients)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + void MaskGradientNonMasters(PointerWithSize* gradients, - size_t max_rows, const galois::DynamicBitSet&); + size_t max_rows, + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradients->size()); + assert(end_node * row_index <= gradients->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradients, start_node, end_node, + row_index); + } else { +#endif + // galois::gInfo(start_node, " to ", end_node); + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // if something is not a master, kill it + if (!bs.test(non_master)) { + // galois::gInfo("don't keep ", non_master); + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradients)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! Does some math to get GB used by some # of floats double FloatElementsToGB(size_t num_of_floats) const { diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 988276965d..2c7a41ecab 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -1,5 +1,7 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/Logging.h" +#include "galois/GNNMath.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/GraphConvolutionalLayer.cuh" @@ -9,19 +11,113 @@ namespace galois { extern galois::DynamicBitSet graphs::bitset_graph_aggregate; -class GraphConvolutionalLayer : public GNNLayer { +template +class GraphConvolutionalLayer : public GNNLayer { public: //! 
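Editor's note: the allocation log messages report buffer sizes via FloatElementsToGB; with 4-byte floats that is presumably count * 4 / 2^30 (GiB). For example, a 1,000,000 x 256 float matrix is about 0.95 GiB. A one-line helper along those lines (name is illustrative):

#include <cstddef>

// GiB occupied by `num_floats` single-precision values (4 bytes each).
inline double FloatsToGB(std::size_t num_floats) {
  return static_cast<double>(num_floats) * sizeof(float) /
         (1024.0 * 1024.0 * 1024.0);
}
// e.g. FloatsToGB(1'000'000 * 256) ~= 0.95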
Initializes the variables of the base class and also allocates additional //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix GraphConvolutionalLayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config); + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + galois::gWarn( + "GCN layer not up to date with new subgraph/sampling changes; " + "do not use until updated to reflect changes (see GraphSAGE layer)"); + + size_t num_input_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns; + if (!this->config_.disable_dropout || + this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", GCN input temp var 1 ", + num_input_elements, " (", + this->FloatElementsToGB(num_input_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_input_elements); + } else { +#endif + in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + // only on in dropout case + if in temp is smaller than out temp + if (!this->config_.disable_dropout && + (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns)) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", GCN input temp var 2 ", + num_input_elements, " (", + this->FloatElementsToGB(num_input_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_input_elements); + } else { +#endif + in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + size_t num_output_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns; + + // only needed if out temp would be smaller than intemp + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + // xform matrix first to work with a smaller output size + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", GCN output temp var ", + num_output_elements, " (", + this->FloatElementsToGB(num_output_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_elements); + } else { +#endif + out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + this->layer_type_ = galois::GNNLayerType::kGraphConvolutional; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // init pointers with size + p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), + num_input_elements); + p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), + num_input_elements); + p_out_temp_ = PointerWithSize(gpu_object_.out_temp(), + num_output_elements); + } else { +#endif + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + 
p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + GALOIS_LOG_VERBOSE("Conv layer initialized"); + } GraphConvolutionalLayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : GraphConvolutionalLayer(layer_num, graph, backward_output_matrix, @@ -29,11 +125,200 @@ class GraphConvolutionalLayer : public GNNLayer { // Parent functions const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { + galois::StatTimer timer("ForwardPhase", kRegionName); + timer.start(); + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == (this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns)); + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; + // first, dropout + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + this->DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); + } + + // flip aggregate/update if dimensions favor it (do less work) + if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + // aggregation and update + AggregateAll(this->layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + UpdateEmbeddings(agg_data, this->p_forward_output_matrix_.data()); + } else { + // update to aggregate + // FW + UpdateEmbeddings(input_data, p_out_temp_.data()); + // A(FW) + AggregateAll(this->layer_dimensions_.output_columns, p_out_temp_.data(), + this->p_forward_output_matrix_.data(), + &output_column_intermediates_); + } + + if (!this->config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + this->Activation(); + } + + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + timer.stop(); + + return this->p_forward_output_matrix_; + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) final; + PointerWithSize* input_gradient) final { + galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_timer("BackwardPhaseWeight", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", + kRegionName); + timer.start(); + + assert(this->layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!this->config_.disable_activation) { + this->ActivationDerivative(input_gradient); + } + + // AFW = O + galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; + if (!this->config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + agg_data = p_in_temp_1_; + } + + // NOTE: PREV LAYER INPUT AND BACKWARDOUTPUT ARE THE SAME MEMORY LOCATION; + // BEWARE OF DEPENDENCIES + + // derivative of aggregation/update + // TODO clean up logic here to reduce nesting + 
if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + this->MaskInputNonMasters(&agg_data); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients.data()); + } else { +#endif + weight_gradient_timer.start(); + // temp 2 holds aggregated feature vectors from forward phase + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + // gradient isn't masked here; only temp1, which has already been + // overwritten = fine + if (this->layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), + this->p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } + } else { + // TODO at this point, out_temp contains memoized FW + // can use it to get A' = O' (FW)^T + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + AggregateAll(this->layer_dimensions_.output_columns, + input_gradient->data(), p_out_temp_.data(), + &output_column_intermediates_, true); + + // done after above because input_data = p_backward_output_matrix in some + // cases; use first before overwriting here if layer # doesn't = 0, it + // means I can mess with the input data itself instad of masking the + // gradients I can mask the input + if (this->layer_number_ != 0) { + this->MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + this->MaskGradientNonMasters(&p_out_temp_); + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients.data()); + } else { +#endif + weight_gradient_timer.start(); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + if (this->layer_number_ != 0) { + // can now overwrite p_backward without issue; since input gradient + // is untouched if layer number isn't 0 this will be correct + UpdateEmbeddingsDerivative(p_out_temp_.data(), + 
this->p_backward_output_matrix_.data()); + } + } + + // sync weight gradients; note aggregation sync occurs in the function call + // already + weight_gradient_sync_timer.start(); + this->WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); + + if (!this->config_.disable_dropout && this->layer_number_ != 0) { + this->DoDropoutDerivative(); + } + + timer.stop(); + return this->p_backward_output_matrix_; + } private: static const constexpr char* kRegionName = "GCNLayer"; @@ -59,28 +344,194 @@ class GraphConvolutionalLayer : public GNNLayer { output_column_intermediates_; //! CPU aggregation - void AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts); + void + AggregateAllCPU(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>*) { + galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); + size_t num_nodes = this->graph_.size(); + size_t last_master = *(this->graph_.end_owned()); + assert(0 == *(this->graph_.begin_owned())); + + galois::do_all( + galois::iterate(static_cast(0), num_nodes), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + if (this->layer_phase_ == GNNPhase::kTrain) { + if (this->IsSampledLayer()) { + // XXX(loc) + GALOIS_LOG_WARN( + "Edge sampling not yet implemented for GCN; only SAGE"); + // check if node is part of sampled graph; ignore after 0'ing if + // not sampled + if (!this->graph_.IsInSampledGraph(src)) + return; + } + } + + GNNFloat source_norm = 0.0; + if (!this->config_.disable_normalization) { + source_norm = this->graph_.GetGCNNormFactor(src); + } + + // init to self + if (!this->config_.disable_self_aggregate) { + graphs::bitset_graph_aggregate.set(src); + // only aggregate self once on master + if (src < last_master) { + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i] * source_norm * + source_norm; + } + } + } + + // loop through all destinations to grab the feature to aggregate + for (auto e = this->graph_.edge_begin(src); + e != this->graph_.edge_end(src); e++) { + size_t dst = this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set(src); + + if (this->layer_phase_ == GNNPhase::kTrain) { + if (this->IsSampledLayer()) { + // ignore non-sampled nodes + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(dst)) + continue; + } + } + + size_t index_to_dst_feature = dst * column_length; + + if (!this->config_.disable_normalization) { + GNNFloat norm_scale = + source_norm * this->graph_.GetGCNNormFactor(dst); + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + }, + galois::chunk_size<1>(), galois::steal(), + galois::loopname("ConvolutionalAggregateAll")); + // aggregate sync + aggregate_all_sync_timer.start(); + this->graph_.AggregateSync(aggregate_output, column_length); + aggregate_all_sync_timer.stop(); + } //! 
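Editor's note: AggregateAllCPU above sums, for each source vertex, its neighbors' features scaled by the symmetric GCN normalization (plus a self term when self-aggregation is enabled). A compact sequential CSR sketch of that aggregation, assuming GetGCNNormFactor(v) corresponds to 1/sqrt(degree(v)); graph layout and names here are illustrative only:

#include <cmath>
#include <cstddef>
#include <vector>

// Symmetric-normalized GCN aggregation over a CSR graph:
//   out[src] = sum over dst in N(src) of feat[dst] / sqrt(deg(src) * deg(dst))
void GcnAggregate(const std::vector<std::size_t>& row_ptr,  // CSR offsets
                  const std::vector<std::size_t>& col_idx,  // CSR destinations
                  const std::vector<float>& feat,           // num_nodes x dim
                  std::size_t dim, std::vector<float>* out) {
  std::size_t num_nodes = row_ptr.size() - 1;
  out->assign(num_nodes * dim, 0.0f);
  for (std::size_t src = 0; src < num_nodes; ++src) {
    float deg_src  = static_cast<float>(row_ptr[src + 1] - row_ptr[src]);
    float norm_src = deg_src > 0 ? 1.0f / std::sqrt(deg_src) : 0.0f;
    for (std::size_t e = row_ptr[src]; e < row_ptr[src + 1]; ++e) {
      std::size_t dst = col_idx[e];
      float deg_dst = static_cast<float>(row_ptr[dst + 1] - row_ptr[dst]);
      float norm = norm_src * (deg_dst > 0 ? 1.0f / std::sqrt(deg_dst) : 0.0f);
      for (std::size_t i = 0; i < dim; ++i) {
        (*out)[src * dim + i] += feat[dst * dim + i] * norm;
      }
    }
  }
}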
Performs aggregation for all nodes of the graph given the length of the //! vector to aggregate, the features themselves, an output array, and per //! thread storage for the intermediate scaling via norm factor - void - AggregateAll(size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts); + void AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts) { + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); + } + void AggregateAll(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts, - bool is_backward); + bool is_backward) { + std::string agg_timer_name = "Aggregate"; + if (!is_backward) { + agg_timer_name += "Forward"; + } else { + agg_timer_name += "Backward"; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + timer.start(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + size_t last_master = *(this->graph_.end_owned()); + gpu_object_.AggregateAllGPU( + this->graph_.GetGPUGraph(), this->graph_.size(), column_length, + node_embeddings, aggregate_output, + !this->config_.disable_normalization, + this->config_.disable_self_aggregate, last_master); + this->graph_.AggregateSyncGPU(aggregate_output, column_length, + this->layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); + } //! Do embedding update via mxm with this layer's weights (forward) - void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("ForwardXform", kRegionName); + timer.start(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU(this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); + } + //! 
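Editor's note: the forward and backward phases above flip between aggregate-then-update and update-then-aggregate based on input_columns <= output_columns: aggregation cost scales with the feature width being aggregated, so it pays to aggregate in the narrower representation. A rough, illustrative cost model for that decision (function name is hypothetical):

#include <cstddef>

// Approximate FLOP counts for a layer over |V| nodes and |E| edges with
// in_cols input features and out_cols output features:
//   aggregate-first: |E| * in_cols   (A X)    + |V| * in_cols * out_cols  ((AX) W)
//   update-first:    |V| * in_cols * out_cols (X W)  + |E| * out_cols     (A (XW))
// The matmul term is identical, so aggregate-first wins iff in_cols <= out_cols.
inline bool AggregateFirst(std::size_t num_edges, std::size_t num_nodes,
                           std::size_t in_cols, std::size_t out_cols) {
  double agg_first =
      double(num_edges) * in_cols + double(num_nodes) * in_cols * out_cols;
  double update_first =
      double(num_nodes) * in_cols * out_cols + double(num_edges) * out_cols;
  return agg_first <= update_first;
}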
Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("BackwardXform", kRegionName); + timer.start(); + + assert(this->p_layer_weights_.size() == + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); + } + #ifdef GALOIS_ENABLE_GPU GCNGPUAllocations gpu_object_; #endif diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h index 0ed1a0d0df..e3ec67f726 100644 --- a/libgnn/include/galois/layers/L2NormLayer.h +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -8,39 +8,152 @@ namespace galois { //! Applies L2 norm to rows of the input -class L2NormLayer : public GNNLayer { +template +class L2NormLayer : public GNNLayer { public: - L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : L2NormLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}) {} - L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config) { - layer_type_ = galois::GNNLayerType::kL2Norm; + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config) { + this->layer_type_ = galois::GNNLayerType::kL2Norm; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); GALOIS_LOG_VERBOSE("L2 norm initialized"); } const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings); + ForwardPhase(const PointerWithSize input_embeddings) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + GALOIS_LOG_FATAL( + "L2 Layer has not been kept up to date for months; do not use"); + return ForwardPhaseCPU(input_embeddings); + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient); + PointerWithSize* input_gradient) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + return BackwardPhaseCPU(prev_layer_input, input_gradient); + } private: const PointerWithSize - ForwardPhaseCPU(const PointerWithSize input_embeddings); + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + this->forward_output_matrix_.assign(this->forward_output_matrix_.size(), + 0.0); + // for each row, get square root of squared sums then normalize + const size_t feature_length = 
this->layer_dimensions_.input_columns; + // TODO(loc) make sure this works in distributed setting as well + galois::do_all( + galois::iterate(this->graph_.begin_owned(), this->graph_.end_owned()), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(row)) + return; + } + + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + float running_square_sum = 0.0; + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(input_embeddings[row_index], 2); + } + + // make sure running sum isn't too small + running_square_sum = + (running_square_sum < 1.0e-12) ? 10e-12 : running_square_sum; + + // sqrt of sums, then divide row by it + float sqrt_squares = std::pow(running_square_sum, 0.5); + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + this->forward_output_matrix_[row_index] = + input_embeddings[row_index] / sqrt_squares; + } + } + }, + galois::loopname("L2ForwardNormalization")); + + return this->forward_output_matrix_; + } PointerWithSize BackwardPhaseCPU(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient); + PointerWithSize* input_gradient) { + galois::do_all( + galois::iterate(size_t{0}, this->p_backward_output_matrix_.size()), + [&](size_t i) { this->p_backward_output_matrix_[i] = 0; }); + const size_t feature_length = this->layer_dimensions_.input_columns; + + // derivative of some x_1 is sum of gradient w.r.t. x_1 for all elements of + // the row (since l2 norm affects entire row) + // The math itself can be derived using quotient/chain rule on each element + // of the normalized row + galois::do_all( + galois::iterate(this->graph_.begin_owned(), this->graph_.end_owned()), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(row)) + return; + } + + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + // note: if you work this out on paper it turns out that terms that + // seem extra in the way this is calculated below simply get + // canceled out, so this ends up working out This implementation is + // taken from the IPDPS GraphSAINT implementation: I (loc) have + // confirmed the math checks out + float running_square_sum = 0.0; + float mult_with_input = 0.0; + + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(prev_layer_input[row_index], 2); + // gradient multiplied with corresponding input; subtraction + // because derivative math ends up working out that way + mult_with_input -= + prev_layer_input[row_index] * (*input_gradient)[row_index]; + } + running_square_sum = + (running_square_sum < 1.0e-12) ? 
10e-12 : running_square_sum; + assert(running_square_sum != 0.0); + + // denominator for all gradients is just the square sum to the + // -3/2'd power since this is -, all we have to do is multiply it + // later rather than divide + float denominator = std::pow(running_square_sum, -1.5); + assert(denominator != 0.0); + + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + this->p_backward_output_matrix_[row_index] = + denominator * + (prev_layer_input[row_index] * mult_with_input + + (*input_gradient)[row_index] * running_square_sum); + } + } + }, + galois::loopname("L2Backward")); + + return this->p_backward_output_matrix_; + } //! No op void OptimizeLayer(BaseOptimizer*, size_t) { return; }; diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 581115a00e..19d5a75815 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -1,6 +1,8 @@ #pragma once #include "galois/layers/GNNLayer.h" #include "galois/layers/GradientSyncStructures.h" +#include "galois/GNNMath.h" +#include "galois/Logging.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/SAGELayer.cuh" @@ -22,23 +24,177 @@ struct SAGELayerConfig { //! ends up performing better for some graphs) //! - Concatination of the self: rather than aggregating self //! feature it is concatinated (i.e. dimensions are doubled) -class SAGELayer : public GNNLayer { +template +class SAGELayer : public GNNLayer { public: //! Initializes the variables of the base class and also allocates additional //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix - SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, - const SAGELayerConfig& sage_config); + const SAGELayerConfig& sage_config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config), + sage_config_(sage_config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + if (!sage_config_.disable_concat) { + // there are now 2 weight matrices used: one for self, one for aggregation + // abstractly it's one matrix: W = W1 | W2 + size_t num_weight_elements = this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns; + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE second layer weights ", + num_weight_elements, " (", + this->FloatElementsToGB(num_weight_elements), " GB)"); + // TODO(lhc) for now, allocate dummy cpu weight2 for copying to GPU + layer_weights_2_.resize(num_weight_elements); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeight2(num_weight_elements); + } +#endif + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE second layer gradients ", + num_weight_elements, " (", + this->FloatElementsToGB(num_weight_elements), " GB)"); + layer_weight_gradients_2_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeightGradient2(num_weight_elements); + } +#endif + + // reinit both weight matrices as one unit + this->PairGlorotBengioInit(&this->layer_weights_, &layer_weights_2_); +#ifdef GALOIS_ENABLE_GPU + 
if (device_personality == DevicePersonality::GPU_CUDA) { + // copy weight2 to GPU + gpu_object_.CopyToWeights2(layer_weights_2_); + p_layer_weights_2_ = PointerWithSize( + gpu_object_.layer_weights_2(), num_weight_elements); + p_layer_weight_gradients_2_ = PointerWithSize( + gpu_object_.layer_weight_gradients_2(), num_weight_elements); + } else { +#endif + // update the pointers to them as well as realloc will require it + p_layer_weights_2_ = PointerWithSize(layer_weights_2_); + p_layer_weight_gradients_2_ = + PointerWithSize(layer_weight_gradients_2_); +#ifdef GALOIS_ENABLE_GPU + } +#endif + std::vector weight_size = {num_weight_elements}; + // initialize the optimizer + second_weight_optimizer_ = + std::make_unique(weight_size, 1); + } + + // TODO(loc) dropout uses input rows; this won't work if dropout is enabled + size_t num_in_temp_elements = this->layer_dimensions_.output_rows * + this->layer_dimensions_.input_columns; + + // if (this->layer_number_ == 0) { + // // set this to true for layer 0; it avoids aggregation completely + // // in the last layer for the backward phase + // config_.disable_aggregate_after_update = true; + // // TODO this *will* hurt test evaluation because test eval has no + // // backward phase, so the end-to-end benefits do not exist there + // // Solution to this is to allocate all intermediate structures for both + // // cases + make sure resize handles both cases + // } + + // if in temp is smaller than out temp, or if dropout exists + if (!this->config_.disable_dropout || + this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE input temp var 1 ", + num_in_temp_elements, " (", + this->FloatElementsToGB(num_in_temp_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_in_temp_elements); + } else { +#endif + in_temp_1_.resize(num_in_temp_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + // only on in dropout case + if in temp is smaller than out temp + if (!this->config_.disable_dropout && + (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns)) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE input temp var 2 ", + num_in_temp_elements, " (", + this->FloatElementsToGB(num_in_temp_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_in_temp_elements); + } else { +#endif + in_temp_2_.resize(num_in_temp_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + size_t num_out_temp = this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns; + // only needed if out temp would be smaller than intemp + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE output temp var ", + num_out_temp, " (", this->FloatElementsToGB(num_out_temp), + " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_out_temp); + } else { +#endif + out_temp_.resize(num_out_temp, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + this->layer_type_ = 
galois::GNNLayerType::kSAGE; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // init pointers with size + p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), + num_in_temp_elements); + p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), + num_in_temp_elements); + p_out_temp_ = PointerWithSize(gpu_object_.out_temp(), + num_output_elements); + } else { +#endif + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + GALOIS_LOG_VERBOSE("SAGE layer initialized"); + } - SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, config, SAGELayerConfig()) {} - SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, @@ -69,11 +225,350 @@ class SAGELayer : public GNNLayer { // Parent functions const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { + // galois::gDebug( + // "Layer ", this->layer_number_, " dims: ", + // layer_dimensions_.input_rows, " ", layer_dimensions_.output_rows, " ", + // layer_dimensions_.input_columns, " + // ", layer_dimensions_.output_columns, " ", input_embeddings.size(), " + // ", layer_dimensions_.input_rows * layer_dimensions_.input_columns); + galois::StatTimer timer("ForwardPhase", kRegionName); + this->TimerStart(&timer); + + assert(input_embeddings.size() >= (this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns)); + assert(this->p_forward_output_matrix_.size() >= + (this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns)); + + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; + // first, dropout + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + this->DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); + } + + // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part + // which is done regardless + + // flip aggregate/update if dimensions favor it (do less work) + if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + assert(p_in_temp_2_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.input_columns); + } else { + assert(p_in_temp_1_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.input_columns); + } + + // aggregation and update + AggregateAll(this->layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + assert(this->p_forward_output_matrix_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns); + UpdateEmbeddings(agg_data, this->p_forward_output_matrix_.data(), true); + } else { + 
assert(p_out_temp_.size() >= this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns); + + // update to aggregate + // FW + UpdateEmbeddings(input_data, p_out_temp_.data(), false); + + // A(FW) + assert(this->p_forward_output_matrix_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns); + AggregateAll(this->layer_dimensions_.output_columns, p_out_temp_.data(), + this->p_forward_output_matrix_.data(), + &output_column_intermediates_); + } + + if (!sage_config_.disable_concat) { + // FW1 is unaffected by the agg/update flip, so can to it + // separately + SelfFeatureUpdateEmbeddings(input_data, + this->p_forward_output_matrix_.data()); + } + + if (!this->config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + this->Activation(); + } + + assert(this->p_forward_output_matrix_.size() >= + (this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns)); + + this->TimerStop(&timer); + + return this->p_forward_output_matrix_; + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) final; + PointerWithSize* input_gradient) final { + galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", + kRegionName); + galois::StatTimer weight_gradient_sync_timer2("BackwardPhaseWeight2Sync", + kRegionName); + this->TimerStart(&timer); + + assert(this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch); + + // derivative of activation + if (!this->config_.disable_activation) { + this->ActivationDerivative(input_gradient); + } + + // if dropout was used, use the dropout matrix for the input + galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; + if (!this->config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + agg_data = p_in_temp_1_; + } + + // aggregate this here before gradient starts to get overwritten + // this is xform ffirst + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + // TODO: this is absolutely terrible performance wise as well; keep + // in mind + AggregateAll(this->layer_dimensions_.output_columns, + input_gradient->data(), p_out_temp_.data(), + &output_column_intermediates_, true); + } + + if (!sage_config_.disable_concat) { + if (this->layer_number_ != 0) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows); + } + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows); + } + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { 
+ gpu_object_.UpdateWeight2DerivativeGPU( + this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), p_layer_weight_gradients_2_.data()); + } else { +#endif + // input data (prev layer input or temp1) or gradient need mask + // can mask gradient if layer == 0 + // otherwise must mask other + + galois::StatTimer concat_grad_timer("ConcatGradMultiply", kRegionName); + this->TimerStart(&concat_grad_timer); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), p_layer_weight_gradients_2_.data()); + this->TimerStop(&concat_grad_timer); + +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + weight_gradient_sync_timer2.start(); + this->WeightGradientSyncSum2(); + weight_gradient_sync_timer2.stop(); + + // derivative of aggregation/update + // TODO clean up logic here to reduce nesting + if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + // XXX masking may not be required in sampling case where rows change + if (this->layer_number_ != 0 || sage_config_.disable_concat) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows); + } + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // XXX output rows + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + } else { +#endif + // agg data holds aggregated feature vectors from forward phase + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + this->TimerStart(&normal_grad_timer); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + this->TimerStop(&normal_grad_timer); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + // 0 means input gradient shouldn't get masked + if (this->layer_number_ != 0) { + // NOTE: this is super nice because it avoids aggregation completely + // in the layer 0 setting + // ---unmasked--- + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + // overwrites the dropout matrix that was in ptemp1 (needed for second + // weight matrix) + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), + true); + + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), + this->p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } + } else { + // xform first + + // --unmasked-- + + // disable concat is part of condition because otherwise this mask + // should have gotten done elsewhere 
+ if (this->layer_number_ != 0 && sage_config_.disable_concat) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows); + } + } + + // layer number 0 means output needs to be masked because input cannot + // be masked + if (this->layer_number_ == 0) { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(&p_out_temp_, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(&p_out_temp_, + this->layer_dimensions_.input_rows); + } + } + + // W' = F^T (FW)' + // TODO put this in a function +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients_.data()); + } else { +#endif + // input col x input row * input row x output col + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + this->TimerStart(&normal_grad_timer); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients_.data()); + this->TimerStop(&normal_grad_timer); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + // to get a correct result out temp mask cannot be masked; + // outtemp will only be masked if layer number is 0, so this + // is safe in all other cases + if (this->layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative( + p_out_temp_.data(), this->p_backward_output_matrix_.data(), false); + } + } + + weight_gradient_sync_timer.start(); + this->WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); + + // full gradient needed here; should occur after all updates + if (this->layer_number_ != 0) { + // deal with feature gradients for the self feature here + // this function will sum directly into the backward matrix + // input gradient never gets masked if layer number != 0 + SelfFeatureUpdateEmbeddingsDerivative( + input_gradient->data(), this->p_backward_output_matrix_.data()); + } + + if (!this->config_.disable_dropout && this->layer_number_ != 0) { + this->DoDropoutDerivative(); + } + + this->TimerStop(&timer); + return this->p_backward_output_matrix_; + } #ifdef GALOIS_ENABLE_GPU //! 
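Editor's note: with concatenation enabled, the SAGE forward output above is effectively O = (A X) W + X W2, where W2 is the second ("self feature") weight matrix; SelfFeatureUpdateEmbeddings accumulates its product on top of the aggregated term, and the backward phase therefore produces a separate gradient for W2. A dense, single-threaded sketch of the forward combination (naive loops stand in for the BLAS calls; names are illustrative):

#include <cstddef>
#include <vector>

// out (rows x out_cols) = agg (rows x in_cols) * w_agg + self (rows x in_cols) * w_self
// where w_agg/w_self are in_cols x out_cols and `agg` holds aggregated neighbor features.
void SageCombine(const std::vector<float>& agg, const std::vector<float>& self,
                 const std::vector<float>& w_agg, const std::vector<float>& w_self,
                 std::size_t rows, std::size_t in_cols, std::size_t out_cols,
                 std::vector<float>* out) {
  out->assign(rows * out_cols, 0.0f);
  for (std::size_t r = 0; r < rows; ++r) {
    for (std::size_t k = 0; k < in_cols; ++k) {
      float a = agg[r * in_cols + k];
      float s = self[r * in_cols + k];
      for (std::size_t c = 0; c < out_cols; ++c) {
        (*out)[r * out_cols + c] +=
            a * w_agg[k * out_cols + c] + s * w_self[k * out_cols + c];
      }
    }
  }
}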
Copies over self weight gradients to CPU from GPU @@ -93,52 +588,457 @@ class SAGELayer : public GNNLayer { void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts, - bool is_backward); + galois::substrate::PerThreadStorage>*, + bool is_backward) { + // aggregation causes a row count change + size_t num_rows_to_handle; + if (!is_backward) { + num_rows_to_handle = this->layer_dimensions_.output_rows; + } else { + num_rows_to_handle = this->layer_dimensions_.input_rows; + } + + galois::do_all( + galois::iterate(*(this->graph_.begin()), num_rows_to_handle), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + GNNFloat source_norm = 0.0; + if (!this->config_.disable_normalization) { + source_norm = + this->graph_.GetDegreeNorm(src, this->graph_user_layer_number_); + } + + if (!is_backward) { + // loop through all destinations to grab the feature to aggregate + for (auto e = this->graph_.edge_begin(src); + e != this->graph_.edge_end(src); e++) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { + // XXX + // galois::gDebug("In here"); + if (this->IsSampledLayer()) { + if (!this->graph_.IsEdgeSampled( + e, this->graph_user_layer_number_)) { + continue; + } + } + } + size_t dst = this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); + size_t index_to_dst_feature = dst * column_length; + + if (!this->config_.disable_normalization) { + GNNFloat norm_scale = source_norm; + assert(norm_scale != 0); + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + } else { + // loop through all destinations to grab the feature to aggregate + for (auto e = this->graph_.in_edge_begin(src); + e != this->graph_.in_edge_end(src); e++) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { + // XXX + if (this->IsSampledLayer()) { + if (!this->graph_.IsInEdgeSampled( + e, this->graph_user_layer_number_)) { + continue; + } + } + } + size_t dst = this->graph_.GetInEdgeDest(e); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); + + // input row x output row in backward means that i shouldn't be + // touching nodes past output rows; the above sample check + // should deal with this where this matters + assert(dst < this->layer_dimensions_.output_rows); + + size_t index_to_dst_feature = dst * column_length; + + if (!this->config_.disable_normalization) { + GNNFloat norm_scale = this->graph_.GetDegreeNorm( + dst, this->graph_user_layer_number_); + + assert(norm_scale != 0); + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + } + }, + 
galois::chunk_size<1>(), galois::steal(), + galois::loopname("SAGEAggregateAll")); + } //! Performs aggregation for all nodes of the graph given the length of the //! vector to aggregate, the features themselves, an output array, and per //! thread storage for the intermediate scaling via norm factor - void - AggregateAll(size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts); + void AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts) { + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); + } + void AggregateAll(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts, - bool is_backward); + bool is_backward) { + std::string agg_timer_name = "AggregateCompute"; + std::string agg_sync_timer_name = "AggregateSync"; + size_t num_rows_to_handle; + if (!is_backward) { + agg_timer_name += "Forward"; + agg_sync_timer_name += "Forward"; + num_rows_to_handle = this->layer_dimensions_.output_rows; + } else { + agg_timer_name += "Backward"; + agg_sync_timer_name += "Backward"; + num_rows_to_handle = this->layer_dimensions_.input_rows; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + galois::StatTimer aggregate_all_sync_timer(agg_sync_timer_name.c_str(), + kRegionName); + this->TimerStart(&timer); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (!this->IsSampledLayer()) { + gpu_object_.AggregateAllGPU( + this->graph_.GetGPUGraph(), this->graph_.size(), column_length, + node_embeddings, aggregate_output, + !this->config_.disable_normalization, is_backward); + } else { + // TODO(hochan) + GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); + } + this->graph_.AggregateSyncGPU(aggregate_output, column_length, + this->layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, + is_backward); + this->TimerStop(&timer); + + // aggregate sync + aggregate_all_sync_timer.start(); + this->graph_.AggregateSync(aggregate_output, column_length, is_backward, + num_rows_to_handle); + aggregate_all_sync_timer.stop(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! 
Do embedding update via mxm with this layer's weights (forward) void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output, - bool after); + bool after) { + galois::StatTimer timer("ForwardXForm", kRegionName); + this->TimerStart(&timer); +#ifdef GALOIS_ENABLE_GPU + // TODO self change + // XXX(hochan) output rows + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU(this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // galois::gDebug("Layer ", this->graph_user_layer_number_, " ", + // layer_dimensions_.output_rows, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); + // CPU version is just a call into CBlas + if (after) { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, this->layer_dimensions_.output_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, + this->p_layer_weights_.data(), output); + } else { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, + this->p_layer_weights_.data(), output); + } +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } + //! Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddings(const GNNFloat* node_embeddings, - GNNFloat* output); + GNNFloat* output) { + galois::StatTimer timer("SelfForwardXForm", kRegionName); + this->TimerStart(&timer); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, output); + } else { +#endif + // note use of layer weights 2 differentiates this from above + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, this->layer_dimensions_.output_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, + layer_weights_2_.data(), output, true); +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } + //! 
Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output, - bool after); + bool after) { + galois::StatTimer timer("BackwardXForm", kRegionName); + this->TimerStart(&timer); + + assert(this->p_layer_weights_.size() >= + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y note input rows is used here due to + // transpose of aggregation + if (after) { + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->p_layer_weights_.data(), output); + } else { + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->p_layer_weights_.data(), output); + } +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } + //! Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddingsDerivative(const GNNFloat* gradients, - GNNFloat* output); + GNNFloat* output) { + galois::StatTimer timer("SelfBackwardXForm", kRegionName); + this->TimerStart(&timer); + + assert(this->p_layer_weights_.size() >= + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsDerivativeGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y true at end -> accumulate + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + layer_weights_2_.data(), output, true); +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } //! override parent function: optimizes the second set of weights as well - void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number) { + galois::StatTimer total_gradient_timer("GradientDescent", kRegionName); + total_gradient_timer.start(); + optimizer->GradientDescent(this->p_layer_weight_gradients_, + this->p_layer_weights_, trainable_layer_number); + if (!sage_config_.disable_concat) { + second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, + p_layer_weights_2_, 0); + } + total_gradient_timer.stop(); + } //! 
Sync second set of weight gradients - void WeightGradientSyncSum2(); + void WeightGradientSyncSum2() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + this->TimerStart(&clubbed_timer); + galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); + this->TimerStart(&t); + int weight_size = static_cast(p_layer_weight_gradients_2_.size()); + +#ifdef GALOIS_ENABLE_GPU + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); + MPI_Allreduce(MPI_IN_PLACE, + static_cast(layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + gpu_object_.CopyToWeight2Gradients(layer_weight_gradients_2_); + } else { +#endif + // TODO(loc) remove this limitation later; can just do a loop over the + // weight matrix + if (p_layer_weight_gradients_2_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL( + "Weight sync code does not handle size larger than max " + "int at the moment"); + } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&t); + this->TimerStop(&clubbed_timer); + } void ResizeRows(size_t new_row_count) { - GNNLayer::ResizeRows(new_row_count); + GNNLayer::ResizeRows(new_row_count); ResizeIntermediates(new_row_count, new_row_count); } void ResizeInputOutputRows(size_t input_row, size_t output_row) { - GNNLayer::ResizeInputOutputRows(input_row, output_row); + GNNLayer::ResizeInputOutputRows(input_row, output_row); ResizeIntermediates(input_row, output_row); } - void ResizeIntermediates(size_t new_input_rows, size_t new_output_rows); + void ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { + size_t num_in_temp_elements = + new_output_rows * this->layer_dimensions_.input_columns; + // galois::gDebug(this->graph_.host_prefix(), "Layer num ", + // this->layer_number_, " ", + // in_temp_1_.size(), " and ", num_in_temp_elements, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); + + // if in temp is smaller than out temp, or if dropout exists + if (!this->config_.disable_dropout || + this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + if (in_temp_1_.size() < num_in_temp_elements) { + galois::gInfo(this->graph_.host_prefix(), "Resize layer ", + this->layer_number_, ", SAGE input temp var 1 ", + num_in_temp_elements, " (", + this->FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_1_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_1_ = PointerWithSize(in_temp_1_); + } + } + + // only on in dropout case + if in temp is smaller than out temp + if (!this->config_.disable_dropout && + (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns)) { + if (in_temp_2_.size() < num_in_temp_elements) { + galois::gInfo(this->graph_.host_prefix(), "Resize layer ", + this->layer_number_, ", SAGE input temp var 2 ", + num_in_temp_elements, " (", + 
this->FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_2_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_2_ = PointerWithSize(in_temp_2_); + } + } + + size_t num_output_temp_elements = + new_input_rows * this->layer_dimensions_.output_columns; + // only needed if out temp would be smaller than intemp + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + if (out_temp_.size() < num_output_temp_elements) { + galois::gInfo( + this->graph_.host_prefix(), "Resize layer ", this->layer_number_, + ", SAGE output temp var ", num_output_temp_elements, " (", + this->FloatElementsToGB(num_output_temp_elements), " GB)"); + size_t buffer_size = (num_output_temp_elements * 0.02); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); + } else { +#endif + out_temp_.resize(num_output_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + p_out_temp_ = PointerWithSize(out_temp_); + } + } + } //! SAGE config params SAGELayerConfig sage_config_; diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 209929bf30..26d9271d37 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -1,5 +1,8 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/GNNMath.h" + +#include // TODO(loc) GPU support @@ -8,17 +11,18 @@ namespace galois { //! Sigmoid layer: applies sigmoid function element wise to each element of the //! input. //! Meant for use with *multi-class* labels. -class SigmoidLayer : public GNNLayer { +template +class SigmoidLayer : public GNNLayer { public: - SigmoidLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - + SigmoidLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, - GNNLayerConfig{.allocate_weights = false}), + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig{.allocate_weights = false}), input_loss_(dimensions.input_rows), norm_gradient_vectors_(dimensions.input_columns) { - output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; + this->output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; // input/output columns must be equivalent GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); // output needs to match number of possible classes @@ -27,18 +31,117 @@ class SigmoidLayer : public GNNLayer { //! Normalizes all elements by applying sigmoid to all of them const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) when GPU needs it + printf("%p\n", input_embeddings.data()); + return p_layer_weights_; +#else + return ForwardPhaseCPU(input_embeddings); +#endif + } //! Get gradients to fix distribution such that it leans more towards //! multiclass ground truth. 
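+  //! Derivation note (sketch, assuming the usual multi-label binary
+  //! cross-entropy loss used by the forward pass): with p = sigmoid(x) and
+  //! L = -sum_k [y_k * log(p_k) + (1 - y_k) * log(1 - p_k)], the gradient
+  //! reduces to dL/dx_k = p_k - y_k, which is why BackwardPhaseCPU below
+  //! only subtracts the ground-truth vector from the forward output.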
PointerWithSize BackwardPhase(PointerWithSize, - PointerWithSize*) final; + PointerWithSize*) final { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) when GPU needs it + return p_layer_weights_; +#else + return BackwardPhaseCPU(); +#endif + } private: const PointerWithSize - ForwardPhaseCPU(const PointerWithSize input_embeddings); - PointerWithSize BackwardPhaseCPU(); + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + galois::gWarn( + "Sigmoid layer has not been kept up to date; do not use unless sure" + " it works with new changes"); + + input_loss_.assign(input_loss_.size(), 0.0); + this->forward_output_matrix_.assign(this->forward_output_matrix_.size(), + 0.0); + const size_t feature_length = this->layer_dimensions_.input_columns; + this->node_count_.reset(); + this->float_accumulator_.reset(); + + galois::do_all( + galois::iterate(this->graph_.begin(), this->graph_.end()), + [&](const unsigned local_node) { + if (this->graph_.IsValidForPhase(local_node, this->layer_phase_)) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(local_node)) + return; + } + + this->node_count_ += 1; + + size_t node_offset = feature_length * local_node; + // sigmoid the values for this node + for (unsigned index = 0; index < feature_length; index++) { + // splitting in half is done for numerical stability of log + if (input_embeddings[node_offset + index] >= 0) { + this->forward_output_matrix_[node_offset + index] = + 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + } else { + this->forward_output_matrix_[node_offset + index] = + expf(input_embeddings[node_offset + index]) / + (1.0 + expf(input_embeddings[node_offset + index])); + } + } + + input_loss_[local_node] = GNNCrossEntropy( + feature_length, this->graph_.GetMultiClassLabel(local_node), + &this->forward_output_matrix_[node_offset]); + // TODO(loc) normalize the loss + this->float_accumulator_ += input_loss_[local_node]; + } + }, + galois::steal(), galois::loopname("SigmoidForward")); + + galois::gPrint( + "Average loss is ", + this->float_accumulator_.reduce() / this->node_count_.reduce(), "\n"); + return this->forward_output_matrix_; + } + + PointerWithSize BackwardPhaseCPU() { + const size_t feature_length = this->layer_dimensions_.input_columns; + galois::do_all( + galois::iterate(size_t{0}, this->p_backward_output_matrix_.size()), + [&](size_t i) { this->p_backward_output_matrix_[i] = 0; }); + + galois::do_all( + galois::iterate(this->graph_.begin(), this->graph_.end()), + [&](const unsigned local_node) { + if (this->graph_.IsValidForPhase(local_node, this->layer_phase_)) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(local_node)) + return; + } + + // derivative cross entropy into norm grad + const GNNLabel* ground_truth = + this->graph_.GetMultiClassLabel(local_node); + size_t node_offset = feature_length * local_node; + // sigmoid-cross-entropy derivative: turns out all it is is simple + // subtraction + for (unsigned index = 0; index < feature_length; index++) { + this->p_backward_output_matrix_[node_offset + index] = + this->forward_output_matrix_[node_offset + index] - + ground_truth[index]; + } + } + }, + galois::steal(), galois::loopname("SigmoidBackward")); + + return this->p_backward_output_matrix_; + } //! 
Loss for each row of the input std::vector input_loss_; diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 3878b29685..b55e37f05d 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -1,5 +1,7 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/GNNMath.h" + #ifdef GALOIS_ENABLE_GPU #include "galois/layers/SoftmaxLayer.cuh" #endif @@ -9,13 +11,14 @@ namespace galois { //! Softmax layer: takes each row of the input matrix and creates a probability //! distribution based on the magnitude of elements in each row. //! Currently this only works with **single class* labels and is coded as such. -class SoftmaxLayer : public GNNLayer { +template +class SoftmaxLayer : public GNNLayer { public: - SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - + SoftmaxLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer( + : GNNLayer( layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false, .disable_output = true}), #ifdef GALOIS_ENABLE_GPU @@ -27,7 +30,7 @@ class SoftmaxLayer : public GNNLayer { softmax_temp_vectors_(dimensions.input_columns) { - output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; + this->output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); // output needs to match number of possible classes @@ -35,21 +38,146 @@ class SoftmaxLayer : public GNNLayer { } const PointerWithSize - ForwardPhaseCPU(const PointerWithSize input_embeddings); + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + galois::StatTimer Timer("SoftmaxForward", "SoftmaxLayer"); + this->TimerStart(&Timer); + + // note: p_backward == input_embeddings + input_loss_.assign(input_loss_.size(), 0.0); + const size_t feature_length = this->layer_dimensions_.input_columns; +#ifndef NDEBUG + galois::DGAccumulator loss_accum; + galois::DGAccumulator handled; + loss_accum.reset(); + handled.reset(); +#endif + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned i) { + if (this->IsSampledLayer()) { + if ((this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) && + !this->graph_.IsInSampledGraphSubgraph(i)) { + // XXX + VectorZero(feature_length, + &this->p_backward_output_matrix_[i * feature_length]); + return; + } + } + + // do softmax + GNNSoftmax(feature_length, &input_embeddings[feature_length * i], + &this->p_backward_output_matrix_[feature_length * i]); + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + this->graph_.GetSingleClassLabel(i))] = 1.0; + + // calculate loss for this LID (note not all i will be filled) + input_loss_[i] = GNNCrossEntropy( + feature_length, ground_truth_vec->data(), + &this->p_backward_output_matrix_[feature_length * i]); +#ifndef NDEBUG + loss_accum += input_loss_[i]; + handled += 1; +#endif + }, + // TODO chunk size? 
+ // steal on as some threads may have nothing to work on + // galois::steal(), galois::loopname("SoftmaxForward")); + galois::steal()); +#ifndef NDEBUG + GNNFloat reduced_loss = loss_accum.reduce(); + size_t t = handled.reduce(); + galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, + "\n"); +#endif + + this->TimerStop(&Timer); + return this->p_backward_output_matrix_; + } + //! Creates probability distribution of each row of input const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.ForwardPhaseGPU(this->layer_phase_, this->graph_.size(), + this->layer_dimensions_.input_columns, + input_embeddings.data(), + this->p_backward_output_matrix_.data()); + return this->p_backward_output_matrix_; + } +#endif + return ForwardPhaseCPU(input_embeddings); + } + + PointerWithSize BackwardPhaseCPU() { + galois::StatTimer Timer("SoftmaxBackward", "SoftmaxLayer"); + this->TimerStart(&Timer); + + const size_t feature_length = this->layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned node) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraphSubgraph(node)) + return; + } + + size_t correct = this->graph_.GetSingleClassLabel(node); + // See here for explanation for why this works + // https://gombru.github.io/2018/05/23/cross_entropy_loss/ + // Derivation of full combined derivative isn't there, but some + // emperical inspection tells me this is likely correct + // TODO(loc) work it out myself + for (size_t idx = 0; idx < feature_length; idx++) { + if (idx == correct) { + // positive class + this->p_backward_output_matrix_[node * feature_length + idx] = + this->p_backward_output_matrix_[node * feature_length + idx] - + 1; + } else { + // negative class + this->p_backward_output_matrix_[node * feature_length + idx] = + this->p_backward_output_matrix_[node * feature_length + idx]; + } + } + }, + galois::steal(), galois::loopname("SoftmaxBackward")); + + this->TimerStop(&Timer); + + return this->p_backward_output_matrix_; + } - PointerWithSize BackwardPhaseCPU(); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
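+  //! Derivation note (sketch): BackwardPhaseCPU above fuses softmax with
+  //! single-class cross-entropy. With p = softmax(x) and correct class c,
+  //! L = -log(p_c), so dL/dx_k = p_k - 1 for k == c and p_k otherwise,
+  //! matching the "subtract 1 at the correct index" loop above.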
PointerWithSize - BackwardPhase(PointerWithSize in_out, - PointerWithSize* input_gradient) final; + BackwardPhase(PointerWithSize, + PointerWithSize*) final { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.BackwardPhaseGPU(this->layer_phase_, this->graph_.size(), + this->layer_dimensions_.input_columns, + this->p_backward_output_matrix_.data(), + this->p_backward_output_matrix_.data()); + return this->p_backward_output_matrix_; + } +#endif + return BackwardPhaseCPU(); + } void ResizeRows(size_t new_row_count) { - layer_dimensions_.input_rows = new_row_count; - layer_dimensions_.output_rows = new_row_count; + this->layer_dimensions_.input_rows = new_row_count; + this->layer_dimensions_.output_rows = new_row_count; // no output resize if (input_loss_.size() < new_row_count) { input_loss_.resize(new_row_count * 1.02); @@ -58,8 +186,8 @@ class SoftmaxLayer : public GNNLayer { void ResizeInputOutputRows(size_t in, size_t out) { assert(in == out); - layer_dimensions_.input_rows = in; - layer_dimensions_.output_rows = out; + this->layer_dimensions_.input_rows = in; + this->layer_dimensions_.output_rows = out; // no output resize if (input_loss_.size() < in) { input_loss_.resize(in * 1.02); diff --git a/libgnn/src/DistributedMinibatchTracker.cpp b/libgnn/src/DistributedMinibatchTracker.cpp index dddbc33519..4f25252b0a 100644 --- a/libgnn/src/DistributedMinibatchTracker.cpp +++ b/libgnn/src/DistributedMinibatchTracker.cpp @@ -32,7 +32,7 @@ size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { if (host == 0) { start = 0; end = std::min(num_per_unit * sampled_num_on_hosts_[host], - (uint32_t)total_minibatch_size_); + (uint32_t)total_minibatch_size_); } else if (host == (num_hosts_ - 1)) { start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], (uint32_t)total_minibatch_size_); @@ -41,7 +41,7 @@ size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], (uint32_t)total_minibatch_size_); end = std::min(num_per_unit * sampled_num_on_hosts_[host], - (uint32_t)total_minibatch_size_); + (uint32_t)total_minibatch_size_); } uint32_t proposed_to_take = end - start; diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 582fba95f6..c25f3ae7ec 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -57,8 +57,8 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, constexpr size_t vectorization_length = 16; const size_t aligned_end = length - length % vectorization_length; __m512 scale_vec_main = _mm512_set_ps( - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); for (size_t i = 0; i < aligned_end; i += vectorization_length) { _mm512_storeu_ps( &output[i], diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp deleted file mode 100644 index 201da985d5..0000000000 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ /dev/null @@ -1,818 +0,0 @@ -#include "galois/GNNMath.h" -#include "galois/GraphNeuralNetwork.h" -#include "galois/layers/DenseLayer.h" -#include "galois/layers/GraphConvolutionalLayer.h" -#include "galois/layers/L2NormLayer.h" -#include "galois/layers/SAGELayer.h" -#include "galois/layers/SigmoidLayer.h" -#include 
"galois/layers/SoftmaxLayer.h" - -galois::GraphNeuralNetwork::GraphNeuralNetwork( - std::unique_ptr graph, - std::unique_ptr optimizer, - galois::GraphNeuralNetworkConfig&& config) - : graph_(std::move(graph)), optimizer_(std::move(optimizer)), - config_(std::move(config)) { - if (config_.do_sampling_ && config_.use_train_subgraph_) { - GALOIS_LOG_FATAL("Do not set train subgraph and sampling at same time " - "(sampling uses training subgraph already)"); - } - // max number of rows that can be passed as inputs; allocate space for it as - // this will be the # of rows for each layer - size_t max_rows = graph_->size(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->ResizeGPULayerVector(config_.num_intermediate_layers()); - } -#endif - // used for chaining layers together; begins as nullptr - PointerWithSize prev_output_layer(nullptr, 0); - num_graph_user_layers_ = 0; - - // create the intermediate layers - for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { - GNNLayerType layer_type = config_.intermediate_layer_type(i); - size_t prev_layer_columns; - - if (i != 0) { - // grab previous layer's size - prev_layer_columns = config_.intermediate_layer_size(i - 1); - } else { - // first layer means the input columns are # features in graph - prev_layer_columns = graph_->node_feature_length(); - } - - // max dims - GNNLayerDimensions layer_dims = {.input_rows = max_rows, - .input_columns = prev_layer_columns, - .output_columns = - config_.intermediate_layer_size(i), - .output_rows = max_rows}; - - // test minibatch size: if it's not enabled, then currently the full - // graph is used (should really only subgraph the test nodes, though; - // that's a TODO) - if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && - config_.test_minibatch_size()) { - galois::gInfo("Not allocating rows"); - // set to 0 here to make it allocate nothing - layer_dims.input_rows = 0; - layer_dims.output_rows = 0; - } - - switch (layer_type) { - case GNNLayerType::kGraphConvolutional: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); - break; - case GNNLayerType::kSAGE: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) sage layer gpu -#endif - break; - case GNNLayerType::kL2Norm: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - break; - case GNNLayerType::kDense: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - break; - default: - GALOIS_LOG_FATAL("Invalid layer type during network construction"); - } - - // update output layer for next layer - prev_output_layer = gnn_layers_.back()->GetForwardOutput(); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->InitLayerVectorMetaObjects( - i, galois::runtime::getSystemNetworkInterface().Num, - layer_dims.input_columns, layer_dims.output_columns); - } -#endif - } - - // loop backward and find last GCN/SAGE (main) layer to disable activation - for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); - 
back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - galois::gDebug("Disabling activation on layer ", - (*back_iter)->layer_number(), "\n"); - (*back_iter)->DisableActivation(); - break; - } - } - - if (config_.do_sampling() || config_.use_train_subgraph_ || - config.train_minibatch_size() || config.test_minibatch_size()) { - // output layer not included; it will never involve sampling - graph_->InitializeSamplingData(num_graph_user_layers_, - config_.use_train_subgraph_); - } - - num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; - if (config_.train_minibatch_size()) { - graph_->SetupTrainBatcher(config_.train_minibatch_size()); - // size_t local_num = - // if (num_hosts_ > 1) { - // dist_minibatch_tracker_ = std::make_unique( - // galois::runtime::getSystemNetworkInterface().ID, num_hosts_, - // local_num, config_.train_minibatch_size()); - //} - } - - if (config_.test_minibatch_size()) { - graph_->SetupTestBatcher(config_.test_minibatch_size()); - } - - // create the output layer - GNNLayerDimensions output_dims = { - .input_rows = max_rows, - // get last intermediate layer column size - .input_columns = config_.intermediate_layer_size( - config_.num_intermediate_layers() - 1), - .output_columns = config_.output_layer_size(), - .output_rows = max_rows}; - - if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && - config_.test_minibatch_size()) { - output_dims.input_rows = 0; - output_dims.output_rows = 0; - } - - switch (config_.output_layer_type()) { - case (GNNOutputLayerType::kSoftmax): - gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, &prev_output_layer, - output_dims))); - break; - case (GNNOutputLayerType::kSigmoid): - gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, &prev_output_layer, - output_dims))); - break; - default: - GALOIS_LOG_FATAL("Invalid layer type during network construction"); - } - - // sanity checking multi-class + output layer - if (!graph_->is_single_class_label() && - (config_.output_layer_type() != GNNOutputLayerType::kSigmoid)) { - GALOIS_LOG_WARN( - "Using a non-sigmoid output layer with a multi-class label!"); - // if debug mode just kill program - assert(false); - } - - // flip sampling on layers - if (config_.use_train_subgraph_ || config_.do_sampling() || - config_.train_minibatch_size()) { - for (std::unique_ptr& ptr : gnn_layers_) { - ptr->EnableSampling(); - } - } -} - -float galois::GraphNeuralNetwork::MinibatchedTesting() { - galois::gDebug("Minibatched Testing"); - graph_->DisableSubgraph(); - graph_->ResetTestMinibatcher(); - SetLayerPhases(galois::GNNPhase::kBatch); - - bool choose_all_status = graph_->SubgraphChooseAllStatus(); - - uint32_t correct = 0; - uint32_t total = 0; - while (true) { - work_left_.reset(); - // size_t seed_node_count = graph_->PrepareNextTestMinibatch(); - graph_->PrepareNextTestMinibatch(); - // last layer input size/output rows becomes seed node size - // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, - // seed_node_count); - size_t num_sampled_layers = 0; - - for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); - back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - // you can minibatch with sampling or minibatch and grab all - // 
relevant neighbors - // size_t current_sample_size; - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), false, - num_sampled_layers + 1); - // resize this layer, change seed node count - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, seed_node_count); - // seed_node_count = current_sample_size; - - num_sampled_layers++; - // XXX resizes above only work for SAGE layers; will break if other - // layers are tested - } - } - - // resize layer matrices - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - graph_->EnableSubgraphChooseAll(); - CorrectBackwardLinks(); - - const PointerWithSize batch_pred = DoInference(); - std::pair correct_total = - graph_->GetBatchAccuracy(batch_pred); - - correct += correct_total.first; - total += correct_total.second; - - work_left_ += graph_->MoreTestMinibatches(); - char global_work_left = work_left_.reduce(); - if (!global_work_left) { - break; - } - } - - galois::gInfo("Minibatching Correct / Total ", correct, " ", total); - - if (choose_all_status) { - graph_->EnableSubgraphChooseAll(); - } else { - graph_->DisableSubgraphChooseAll(); - } - - return (1.0 * correct) / (1.0 * total); -} - -float galois::GraphNeuralNetwork::Train(size_t num_epochs) { - EnableTimers(); - const size_t this_host = graph_->host_id(); - float train_accuracy{0.f}; - std::vector subgraph_layer_sizes; - // this subgraph only needs to be created once - if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { - galois::StatTimer total_subgraph_construction_timer("TotalSubGraphConstruction", kRegionName); - galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); - galois::StatTimer edge_sampling_timer("SampleAllEdges", kRegionName); - galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); - total_subgraph_construction_timer.start(); - - setup_neighborhood_sample_timer.start(); - // Setup the subgraph to only be the training graph - size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - setup_neighborhood_sample_timer.stop(); - - subgraph_layer_sizes.emplace_back(local_seed_node_count); - galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", - local_seed_node_count); - size_t num_sampled_layers = 0; - edge_sampling_timer.start(); - // gnn_layers_.back()->ResizeRows(local_seed_node_count); - for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); - back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - size_t current_sample_size = graph_->SampleAllEdges( - (*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_, num_sampled_layers + 1); - galois::gDebug(graph_->host_prefix(), - "Number of local nodes for train subgraph for layer ", - (*back_iter)->graph_user_layer_number(), " is ", - current_sample_size); - // resizing - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, - // local_seed_node_count); - local_seed_node_count = current_sample_size; - subgraph_layer_sizes.emplace_back(local_seed_node_count); - num_sampled_layers++; - } - } - edge_sampling_timer.stop(); - subgraph_construction_timer.start(); - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - subgraph_construction_timer.stop(); - CorrectBackwardLinks(); - total_subgraph_construction_timer.stop(); - } - - galois::StatTimer epoch_timer("TrainingTime", kRegionName); - galois::StatTimer 
validation_timer("ValidationTime", kRegionName); - galois::StatTimer epoch_test_timer("TestTime", kRegionName); - - for (size_t epoch = 0; epoch < num_epochs; epoch++) { - epoch_timer.start(); - // swap to train subgraph - if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { - graph_->EnableSubgraph(); - // TODO(loc) this doesn't actually function as expected anymore - // with the numerous changes to the system; this commenting - // out is more of a hack for the train subgraph option (which - // probably shouldn't be used anyways) - - // size_t l_count = 0; - // gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); - // for (auto back_iter = gnn_layers_.rbegin(); - // back_iter != gnn_layers_.rend(); back_iter++) { - // GNNLayerType layer_type = (*back_iter)->layer_type(); - // if (layer_type == GNNLayerType::kGraphConvolutional || - // layer_type == GNNLayerType::kSAGE) { - // (*back_iter) - // ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], - // subgraph_layer_sizes[l_count]); - // l_count++; - // } - //} - CorrectBackwardLinks(); - } - - // beginning of epoch sampling (no minibatches) - if (config_.do_sampling() && !config_.train_minibatch_size()) { - galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); - galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); - galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); - galois::StatTimer edge_sampling_timer("SampleEdges", kRegionName); - mb_timer.start(); - - setup_neighborhood_sample_timer.start(); - size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - setup_neighborhood_sample_timer.stop(); - // gnn_layers_.back()->ResizeRows(local_seed_node_count); - galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", - local_seed_node_count); - size_t num_sampled_layers = 0; - - edge_sampling_timer.start(); - // work backwards on GCN/SAGE layers - // loop backward and find last GCN/SAGE (main) layer to disable activation - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - size_t current_sample_size = graph_->SampleEdges( - (*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_, num_sampled_layers + 1); - galois::gDebug(graph_->host_prefix(), - "Number of local nodes for layer ", - (*back_iter)->graph_user_layer_number(), " is ", - current_sample_size); - - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, - // local_seed_node_count); - local_seed_node_count = current_sample_size; - num_sampled_layers++; - } - } - edge_sampling_timer.stop(); - // resize layer matrices - subgraph_construction_timer.start(); - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - subgraph_construction_timer.stop(); - CorrectBackwardLinks(); - mb_timer.stop(); - } - - if (!config_.train_minibatch_size()) { - // no minibatching, full batch - const PointerWithSize predictions = DoInference(); - // have to get accuracy here because gradient prop destroys the - // predictions matrix - train_accuracy = GetGlobalAccuracy(predictions); - GradientPropagation(); - } else { - graph_->ResetTrainMinibatcher(); - // if (num_hosts_ > 1) { - // dist_minibatch_tracker_->ResetEpoch(); - //} - - SetLayerPhases(galois::GNNPhase::kBatch); - - size_t batch_num = 
0; - - // create mini batch graphs and loop until minibatches on all hosts done - while (true) { - galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); - galois::StatTimer sample_time("MinibatchSampling", kRegionName); - galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); - galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); - mb_timer.start(); - - galois::Timer batch_timer; - batch_timer.start(); - work_left_.reset(); - galois::gInfo("Epoch ", epoch, " batch ", batch_num++); - // break when all hosts are done with minibatches - prep_timer.start(); - size_t seed_node_count; - // if (num_hosts_ > 1) { - // size_t num_for_next_batch = - // dist_minibatch_tracker_->GetNumberForNextMinibatch(); - // galois::gInfo(graph_->host_prefix(), "Sampling ", - // num_for_next_batch, - // " for this minibatch"); - // seed_node_count = - // graph_->PrepareNextTrainMinibatch(num_for_next_batch); - //} else { - //} - seed_node_count = graph_->PrepareNextTrainMinibatch(); - - galois::gDebug(graph_->host_prefix(), - "Number of local seed nodes is for batch is ", - seed_node_count); - prep_timer.stop(); - - // last layer input size/output rows becomes seed node size - // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, - // seed_node_count); - - sample_time.start(); - // +1 later in call because 0 is already taken - size_t num_sampled_layers = 0; - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - // you can minibatch with sampling or minibatch and grab all - // relevant neighbors - size_t current_sample_size; - - if (config_.do_sampling()) { - current_sample_size = graph_->SampleEdges( - (*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_, num_sampled_layers + 1); - } else { - current_sample_size = graph_->SampleAllEdges( - (*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_, num_sampled_layers + 1); - } - - galois::gDebug(graph_->host_prefix(), - "Number of local nodes for layer ", - (*back_iter)->graph_user_layer_number(), " is ", - current_sample_size); - - // resize this layer, change seed node count - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, seed_node_count); - seed_node_count = current_sample_size; - num_sampled_layers++; - } - } - sample_time.stop(); - - // resize layer matrices - subgraph_construction_timer.start(); - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - subgraph_construction_timer.stop(); - CorrectBackwardLinks(); - - // XXX resizes above only work for SAGE layers; will break if other - // layers are tested - - mb_timer.stop(); - - const PointerWithSize batch_pred = DoInference(); - train_accuracy = GetGlobalAccuracy(batch_pred); - GradientPropagation(); - - work_left_ += graph_->MoreTrainMinibatches(); - char global_work_left = work_left_.reduce(); - batch_timer.stop(); - epoch_timer.stop(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Train accuracy/F1 micro is ", train_accuracy, - " time ", batch_timer.get(), "\n"); - - bool test_eval = - config_.minibatch_test_interval_ - ? 
(batch_num - 1) % config_.minibatch_test_interval_ == 0 - : false; - - if (test_eval) { - DisableTimers(); - float test_acc; - if (!config_.test_minibatch_size()) { - // TODO something about this path breaks accuracy - GALOIS_LOG_FATAL("this path breaks accuracy for the rest of the " - "run for some reason"); - bool f = graph_->SubgraphChooseAllStatus(); - graph_->DisableSubgraph(); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // TODO nuclear resize - (*layer)->ResizeRows(graph_->size()); - } - CorrectBackwardLinks(); - SetLayerPhases(galois::GNNPhase::kTest); - graph_->EnableSubgraphChooseAll(); - const PointerWithSize test_pred = DoInference(); - test_acc = GetGlobalAccuracy(test_pred); - graph_->SetSubgraphChooseAll(f); - } else { - test_acc = MinibatchedTesting(); - } - - if (this_host == 0) { - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Test accuracy is ", test_acc, "\n"); - const std::string test_name_acc = - "TestEpoch" + std::to_string(epoch) + "Batch" + - std::to_string(batch_num - 1) + "Accuracy"; - galois::runtime::reportStat_Single(kRegionName, test_name_acc, - test_acc); - } - - // report the training time elapsed at this point in time - galois::runtime::reportStat_Single( - kRegionName, - "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + - std::to_string(batch_num - 1), - epoch_timer.get()); - // revert to training phase for next epoch - SetLayerPhases(galois::GNNPhase::kTrain); - EnableTimers(); - } - - epoch_timer.start(); - - if (!global_work_left) { - // if (num_hosts_ > 1) { - // GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); - //} - break; - } - } - } - epoch_timer.stop(); - - if (this_host == 0) { - const std::string t_name_acc = - "TrainEpoch" + std::to_string(epoch) + "Accuracy"; - galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", - train_accuracy, "\n"); - galois::runtime::reportStat_Single(kRegionName, t_name_acc, - train_accuracy); - } - - bool do_validate = config_.validation_interval_ - ? epoch % config_.validation_interval_ == 0 - : false; - bool do_test = - config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; - - bool subgraph_choose_all_status = graph_->SubgraphChooseAllStatus(); - - if (do_validate || do_test) { - DisableTimers(); - // disable subgraph - graph_->DisableSubgraph(); - graph_->EnableSubgraphChooseAll(); - } - - if (do_validate) { - // XXX induced subgraph here - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // nuclear resize - (*layer)->ResizeRows(graph_->size()); - } - - CorrectBackwardLinks(); - validation_timer.start(); - SetLayerPhases(galois::GNNPhase::kValidate); - const PointerWithSize val_pred = DoInference(); - validation_timer.stop(); - - float val_acc = GetGlobalAccuracy(val_pred); - if (this_host == 0) { - galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, - "\n"); - const std::string v_name_acc = - "ValEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single(kRegionName, v_name_acc, val_acc); - } - } - - if (do_test) { - epoch_test_timer.start(); - float test_acc; - - if (!config_.test_minibatch_size()) { - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // nuclear resize - (*layer)->ResizeRows(graph_->size()); - } - CorrectBackwardLinks(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize test_pred = DoInference(); - epoch_test_timer.stop(); - test_acc = GetGlobalAccuracy(test_pred); - } else { - test_acc = MinibatchedTesting(); - epoch_test_timer.stop(); - } - - if (this_host == 0) { - galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); - const std::string test_name_acc = - "TestEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single(kRegionName, test_name_acc, - test_acc); - } - } - - if (do_validate || do_test) { - // report the training time elapsed at this point in time - galois::runtime::reportStat_Single( - kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch), - epoch_timer.get()); - // revert to training phase for next epoch - SetLayerPhases(galois::GNNPhase::kTrain); - graph_->SetSubgraphChooseAll(subgraph_choose_all_status); - - // TODO too much code dupe - // Resconstruct the train subgraph since it was replaced by test subgraph - if (config_.use_train_subgraph_ && !config_.train_minibatch_size() && - config_.test_minibatch_size() && do_test) { - // Setup the subgraph to only be the training graph - size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", - local_seed_node_count); - size_t num_sampled_layers = 0; - // gnn_layers_.back()->ResizeRows(local_seed_node_count); - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - size_t current_sample_size = graph_->SampleAllEdges( - (*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_, num_sampled_layers + 1); - // resizing - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, - // local_seed_node_count); - local_seed_node_count = current_sample_size; - num_sampled_layers++; - } - } - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - CorrectBackwardLinks(); - } - - EnableTimers(); - } - } - - uint64_t average_epoch_time = epoch_timer.get() / num_epochs; - galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", - average_epoch_time); - 
//DisableTimers(); - // disable subgraph - graph_->DisableSubgraph(); - graph_->EnableSubgraphChooseAll(); - - // check test accuracy - galois::StatTimer test_timer("FinalTestRun", kRegionName); - float global_accuracy; - - test_timer.start(); - - if (!config_.test_minibatch_size()) { - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // TODO nuclear resize; this is **ridiculously** inefficient - // because full graph will be used even if not included in test - // k-hop neighborhood for eval - (*layer)->ResizeRows(graph_->size()); - } - CorrectBackwardLinks(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize predictions = DoInference(); - global_accuracy = GetGlobalAccuracy(predictions); - } else { - global_accuracy = MinibatchedTesting(); - } - - test_timer.stop(); - - if (this_host == 0) { - galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); - galois::runtime::reportStat_Single(kRegionName, "FinalTestAccuracy", - global_accuracy); - } - - return global_accuracy; -} - -const galois::PointerWithSize -galois::GraphNeuralNetwork::DoInference() { - galois::StatTimer timer("DoInference", "GraphNeuralNetwork"); - if (timers_on_) { - timer.start(); - } - - // start with graph features and pass it through all layers of the network - galois::PointerWithSize layer_input = - graph_->GetLocalFeatures(); - - for (std::unique_ptr& ptr : gnn_layers_) { - layer_input = ptr->ForwardPhase(layer_input); - } - - if (timers_on_) { - timer.stop(); - } - - return layer_input; -} - -float galois::GraphNeuralNetwork::GetGlobalAccuracy( - PointerWithSize predictions) { -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - if (cpu_pred_.size() != predictions.size()) { - cpu_pred_.resize(predictions.size()); - } - - // TODO get rid of CPU copy here if possible - AdamOptimizer* adam = static_cast(optimizer_.get()); - adam->CopyToVector(cpu_pred_, predictions); - return graph_->GetGlobalAccuracy(cpu_pred_, phase_, config_.do_sampling()); - } else { -#endif - return graph_->GetGlobalAccuracy(predictions, phase_, - config_.do_sampling()); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GraphNeuralNetwork::GradientPropagation() { - galois::StatTimer timer("GradientPropagation", "GraphNeuralNetwork"); - if (timers_on_) { - timer.start(); - } - - // from output layer get initial gradients - std::vector dummy; - std::unique_ptr& output_layer = gnn_layers_.back(); - galois::PointerWithSize current_gradients = - output_layer->BackwardPhase(dummy, nullptr); - // loops through intermediate layers in a backward fashion - // -1 to ignore output layer which was handled above - for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { - // note this assumes you have at least 2 layers (including output) - size_t layer_index = gnn_layers_.size() - 2 - i; - - // get the input to the layer before this one - galois::PointerWithSize prev_layer_input; - if (layer_index != 0) { - prev_layer_input = gnn_layers_[layer_index - 1]->GetForwardOutput(); - } else { - prev_layer_input = graph_->GetLocalFeatures(); - } - - // backward prop and get a new set of gradients - current_gradients = gnn_layers_[layer_index]->BackwardPhase( - prev_layer_input, ¤t_gradients); - // if not output do optimization/gradient descent - // at this point in the layer the gradients exist; use the gradients to - // update the weights of the layer - gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); - } - - if (timers_on_) { - timer.stop(); - } -} 
- -void galois::GraphNeuralNetwork::CorrectBackwardLinks() { - // layer chain pointer - PointerWithSize prev_output_layer(nullptr, 0); - for (size_t layer_num = 0; layer_num < gnn_layers_.size(); layer_num++) { - // first layer is nullptr so can be ignored - if (layer_num != 0) { - gnn_layers_[layer_num]->UpdateBackwardOutput(&prev_output_layer); - } - prev_output_layer = gnn_layers_[layer_num]->GetForwardOutput(); - } -} diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b0ed03d34c..7fe3fed8f4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1,45 +1,12 @@ // XXX include net interface if necessary -#include "galois/Logging.h" -#include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" -#include "galois/GNNMath.h" -#include "galois/graphs/DegreeSyncStructures.h" -#include -namespace { -//! Partitions a particular dataset given some partitioning scheme -std::unique_ptr -LoadPartition(const std::string& input_directory, - const std::string& dataset_name, - galois::graphs::GNNPartitionScheme partition_scheme, - bool useShad) { - // XXX input path - std::string input_file = input_directory + dataset_name + ".csgr"; - GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); - - // load partition - switch (partition_scheme) { - case galois::graphs::GNNPartitionScheme::kOEC: - return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); - case galois::graphs::GNNPartitionScheme::kCVC: - return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); - case galois::graphs::GNNPartitionScheme::kOCVC: - return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); - default: - GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); - return nullptr; - } -} - -} // end namespace +namespace galois { +namespace graphs { +std::vector* sampled_nodes_ = nullptr; // Sync structure variables; global to get around sync structure // limitations at the moment -namespace galois { -namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; size_t subgraph_size_ = 0; @@ -47,6 +14,8 @@ size_t subgraph_size_ = 0; galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; size_t num_active_layer_rows_ = 0; +//! 
It specifies offset for feature aggregation +size_t feature_aggregation_offset_ = 0; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; @@ -61,1500 +30,6 @@ struct CUDA_Context* cuda_ctx_for_sync; struct CUDA_Context* cuda_ctx; unsigned layer_number_to_sync; #endif -} // namespace graphs -} // namespace galois - -galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, - GNNPartitionScheme partition_scheme, - bool has_single_class_label, - bool useShad) - : GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, - has_single_class_label, useShad) {} - -galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, - const std::string& dataset_name, - GNNPartitionScheme partition_scheme, - bool has_single_class_label, - bool useShad) - : input_directory_(input_directory) { - GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, - dataset_name); - // save host id - host_id_ = galois::runtime::getSystemNetworkInterface().ID; - host_prefix_ = - std::string("[") + - std::to_string(galois::runtime::getSystemNetworkInterface().ID) + - std::string("] "); - // load partition - partitioned_graph_ = - LoadPartition(input_directory_, dataset_name, partition_scheme, useShad); - // reverse edges - partitioned_graph_->ConstructIncomingEdges(); - // mark a node if it is sampled - mark_sampled_nodes_.resize(partitioned_graph_->size()); - - galois::gInfo(host_prefix_, "Number of local proxies is ", - partitioned_graph_->size()); - galois::gInfo(host_prefix_, "Number of local edges is ", - partitioned_graph_->sizeEdges()); - - // read additional graph data - if (dataset_name != "ogbn-papers100M-remap") { - ReadLocalLabels(dataset_name, has_single_class_label); - } else { - galois::gInfo("Remapped ogbn 100M"); - ReadLocalLabelsBin(dataset_name); - } - ReadLocalFeatures(dataset_name); - ReadLocalMasks(dataset_name); - - // init gluon from the partitioned graph - sync_substrate_ = - std::make_unique>( - *partitioned_graph_, host_id_, - galois::runtime::getSystemNetworkInterface().Num, false, - partitioned_graph_->cartesianGrid()); - bitset_graph_aggregate.resize(partitioned_graph_->size()); - - // init norm factors (involves a sync call) - InitNormFactor(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // allocate/copy data structures over to GPU - GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); - InitGPUMemory(); - - // initialize CUDA context - cuda_ctx_ = get_CUDA_context(host_id_); - if (!init_CUDA_context(cuda_ctx_, ::gpudevice)) { - GALOIS_DIE("Failed to initialize CUDA context"); - } - PartitionedGraphInfo g_info; - GetPartitionedGraphInfo(g_info); - load_graph_CUDA_GNN(cuda_ctx_, g_info, - galois::runtime::getSystemNetworkInterface().Num); - } -#endif -} - -bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( - const unsigned lid, const galois::GNNPhase current_phase) const { - // only use ranges if they're complete - // convert to gid first - size_t gid = partitioned_graph_->getGID(lid); - - // select range to use based on phase - const GNNRange* range_to_use; - switch (current_phase) { - case GNNPhase::kTrain: - range_to_use = &global_training_mask_range_; - break; - case GNNPhase::kValidate: - range_to_use = &global_validation_mask_range_; - break; - case GNNPhase::kTest: - range_to_use = &global_testing_mask_range_; - break; - case GNNPhase::kOther: - GALOIS_LOG_FATAL("no range for other"); - break; - default: - GALOIS_LOG_FATAL("Invalid phase used"); - range_to_use = 
nullptr; - } - - // if within range, it is valid - // there is an assumption here that ranges are contiguous; may not - // necessarily be the case in all inputs in which case using the mask is - // required (but less cache efficient) - if (range_to_use->begin <= gid && gid < range_to_use->end) { - return true; - } else { - return false; - } -} - -bool galois::graphs::GNNGraph::IsValidForPhaseMasked( - const unsigned lid, const galois::GNNPhase current_phase) const { - // select mask to use based on phase - const GNNMask* mask_to_use; - switch (current_phase) { - case GNNPhase::kTrain: - mask_to_use = &local_training_mask_; - break; - case GNNPhase::kValidate: - mask_to_use = &local_validation_mask_; - break; - case GNNPhase::kTest: - mask_to_use = &local_testing_mask_; - break; - case GNNPhase::kOther: - if (valid_other_ == 0) { - return false; - } - mask_to_use = &other_mask_; - break; - case GNNPhase::kBatch: - mask_to_use = &local_minibatch_mask_; - break; - default: - GALOIS_LOG_FATAL("Invalid phase used"); - mask_to_use = nullptr; - } - - return (*mask_to_use)[lid]; -} - -void galois::graphs::GNNGraph::AggregateSync( - GNNFloat* matrix_to_sync, const size_t matrix_column_size, bool is_backward, - uint32_t active_row_boundary) const { - gnn_matrix_to_sync_ = matrix_to_sync; - gnn_matrix_to_sync_column_length_ = matrix_column_size; - subgraph_size_ = active_size(); - num_active_layer_rows_ = active_row_boundary; - if (!use_subgraph_ && !use_subgraph_view_) { - // set globals for the sync substrate - if (!is_backward) { - if (use_timer_) { - sync_substrate_->sync("GraphAggregateSync"); - } else { - sync_substrate_->sync("Ignore"); - } - } else { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - clubbed_timer.start(); - sync_substrate_->sync( - "BackwardGraphAggregateSync"); - clubbed_timer.stop(); - } - } else { - // setup the SID to LID map for the sync substrate to use (SID != LID) - gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); - - if (!is_backward) { - if (use_timer_) { - sync_substrate_->sync("GraphAggregateSync"); - } else { - sync_substrate_->sync("Ignore"); - } - } else { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - clubbed_timer.start(); - sync_substrate_->sync( - "BackwardGraphAggregateSync"); - clubbed_timer.stop(); - } - } -} - -#ifdef GALOIS_ENABLE_GPU -void galois::graphs::GNNGraph::AggregateSyncGPU( - GNNFloat* matrix_to_sync, const size_t matrix_column_size, - const unsigned layer_number) const { - size_t layer_input_mtx_column_size = - getLayerInputMatrixColumnSize(cuda_ctx_, layer_number); - size_t layer_output_mtx_column_size = - getLayerOutputMatrixColumnSize(cuda_ctx_, layer_number); - // set globals for the sync substrate - gnn_matrix_to_sync_ = matrix_to_sync; - gnn_matrix_to_sync_column_length_ = matrix_column_size; - cuda_ctx_for_sync = cuda_ctx_; - layer_number_to_sync = layer_number; - // TODO bitset setting - // call sync - cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), - layer_number); - - // XXX no timer if use_timer is off - if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { - if (use_timer_) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); - } else { - sync_substrate_->sync( - "Ignore", gnn_matrix_to_sync_column_length_); - } - } else if (gnn_matrix_to_sync_column_length_ == - layer_output_mtx_column_size) { - if (use_timer_) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); - } 
else { - sync_substrate_->sync( - "Ignore", gnn_matrix_to_sync_column_length_); - } - } else { - GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" - " match to the column size of the CUDA context"); - } -} -#endif -void galois::graphs::GNNGraph::ReadLocalLabelsBin( - const std::string& dataset_name) { - GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - - std::ifstream file_stream; - file_stream.open(input_directory_ + dataset_name + "-labels-dims.txt", - std::ios::in); - size_t num_nodes; - file_stream >> num_nodes >> num_label_classes_ >> std::ws; - assert(num_nodes == partitioned_graph_->globalSize()); - if (host_id_ == 0) { - galois::gInfo("Number of label classes is ", num_label_classes_); - } - file_stream.close(); - - std::string filename = input_directory_ + dataset_name + "-labels.bin"; - std::ifstream file_stream_bin; - file_stream_bin.open(filename, std::ios::binary | std::ios::in); - - std::vector all_labels(num_nodes); - // read all labels into a vector - file_stream_bin.read((char*)all_labels.data(), sizeof(GNNLabel) * num_nodes); - - using_single_class_labels_ = true; - local_ground_truth_labels_.resize(partitioned_graph_->size()); - - galois::GAccumulator found_local_vertices; - found_local_vertices.reset(); - - // save only local ones; can do in parallel as well - // assumes -1 already dealt with - galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->size()), - [&](size_t lid) { - local_ground_truth_labels_[lid] = all_labels[GetGID(lid)]; - found_local_vertices += 1; - }); - - size_t fli = found_local_vertices.reduce(); - galois::gInfo(host_prefix_, "Read ", fli, " labels (", - local_ground_truth_labels_.size() * double{4} / (1 << 30), - " GB)"); - GALOIS_LOG_ASSERT(fli == partitioned_graph_->size()); -} - -void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, - bool has_single_class_label) { - GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - std::string filename; - if (has_single_class_label) { - filename = input_directory_ + dataset_name + "-labels.txt"; - } else { - filename = input_directory_ + dataset_name + "-mlabels.txt"; - } - - // read file header, save num label classes while at it - std::ifstream file_stream; - file_stream.open(filename, std::ios::in); - size_t num_nodes; - file_stream >> num_nodes >> num_label_classes_ >> std::ws; - assert(num_nodes == partitioned_graph_->globalSize()); - if (host_id_ == 0) { - galois::gInfo("Number of label classes is ", num_label_classes_); - } - - // allocate memory for labels - if (has_single_class_label) { - // single-class (one-hot) label for each vertex: N x 1 - using_single_class_labels_ = true; - local_ground_truth_labels_.resize(partitioned_graph_->size()); - } else { - // multi-class label for each vertex: N x num classes - using_single_class_labels_ = false; - local_ground_truth_labels_.resize(partitioned_graph_->size() * - num_label_classes_); - } - - size_t cur_gid = 0; - size_t found_local_vertices = 0; - // each line contains a set of 0s and 1s - std::string read_line; - - // loop through all labels of the graph - while (std::getline(file_stream, read_line)) { - // only process label if this node is local - if (partitioned_graph_->isLocal(cur_gid)) { - uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); - // read line as bitset of 0s and 1s - std::istringstream label_stream(read_line); - int cur_bit; - // bitset size is # of label classes - for (size_t cur_class = 0; cur_class < num_label_classes_; ++cur_class) { - // 
read a bit - label_stream >> cur_bit; - - if (has_single_class_label) { - // no label - if (cur_bit == -1) { - local_ground_truth_labels_[cur_lid] = num_label_classes_; - break; - } - - // in single class, only 1 bit is set in bitset; that represents the - // class to take - if (cur_bit != 0) { - // set class and break (assumption is that's the only bit that is - // set) - local_ground_truth_labels_[cur_lid] = cur_class; - break; - } - } else { - // else the entire bitset needs to be copied over to the label array - // TODO this can possibly be saved all at once rather than bit by bit? - local_ground_truth_labels_[cur_lid * num_label_classes_ + cur_class] = - cur_bit; - } - } - found_local_vertices++; - } - // always increment cur_gid - cur_gid++; - } - - file_stream.close(); - - galois::gInfo(host_prefix_, "Read ", found_local_vertices, " labels (", - local_ground_truth_labels_.size() * double{4} / (1 << 30), - " GB)"); - GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); -} - -void galois::graphs::GNNGraph::ReadLocalFeatures( - const std::string& dataset_name) { - GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); - - // read in dimensions of features, specifically node feature length - size_t num_global_vertices; - - std::string file_dims = input_directory_ + dataset_name + "-dims.txt"; - std::ifstream ifs; - ifs.open(file_dims, std::ios::in); - ifs >> num_global_vertices >> node_feature_length_; - ifs.close(); - - GALOIS_LOG_ASSERT(num_global_vertices == partitioned_graph_->globalSize()); - GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_global_vertices, - node_feature_length_); - - // memory for all features of all nodes in graph - // TODO read features without loading entire feature file into memory; this - // is quite inefficient - std::unique_ptr full_feature_set = - std::make_unique(num_global_vertices * node_feature_length_); - - // read in all features - std::ifstream file_stream; - std::string feature_file = input_directory_ + dataset_name + "-feats.bin"; - file_stream.open(feature_file, std::ios::binary | std::ios::in); - file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * - num_global_vertices * - node_feature_length_); - file_stream.close(); - - // allocate memory for local features - local_node_features_.resize(partitioned_graph_->size() * - node_feature_length_); - - // copy over features for local nodes only - galois::GAccumulator num_kept_vertices; - num_kept_vertices.reset(); - galois::do_all( - galois::iterate(size_t{0}, num_global_vertices), [&](size_t gid) { - if (partitioned_graph_->isLocal(gid)) { - // copy over feature vector - std::copy(full_feature_set.get() + gid * node_feature_length_, - full_feature_set.get() + (gid + 1) * node_feature_length_, - &local_node_features_[partitioned_graph_->getLID(gid) * - node_feature_length_]); - num_kept_vertices += 1; - } - }); - full_feature_set.reset(); - - galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), - " features (", - local_node_features_.size() * double{4} / (1 << 30), " GB)"); - GALOIS_LOG_ASSERT(num_kept_vertices.reduce() == partitioned_graph_->size()); -} - -//! Helper function to read masks from file into the appropriate structures -//! 
given a name, mask type, and arrays to save into -size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( - const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, std::vector* masks) { - size_t range_begin; - size_t range_end; - - // read mask range - std::string mask_filename = - input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; - bool train_is_on = false; - if (mask_type == "train") { - train_is_on = true; - } - - std::ifstream mask_stream; - mask_stream.open(mask_filename, std::ios::in); - mask_stream >> range_begin >> range_end >> std::ws; - GALOIS_LOG_ASSERT(range_begin <= range_end); - - // set the range object - mask_range->begin = range_begin; - mask_range->end = range_end; - mask_range->size = range_end - range_begin; - - size_t cur_line_num = 0; - // valid nodes on this host - size_t local_sample_count = 0; - // this tracks TOTAL # of valid nodes in this group (not necessarily valid - // ones on this host) - size_t valid_count = 0; - std::string line; - // each line is a number signifying if mask is set for the vertex - while (std::getline(mask_stream, line)) { - std::istringstream mask_stream(line); - // only examine vertices/lines in range - if (cur_line_num >= range_begin && cur_line_num < range_end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - valid_count++; - if (partitioned_graph_->isLocal(cur_line_num)) { - (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; - local_sample_count++; - } - if (train_is_on) { - global_training_mask_[cur_line_num] = 1; - } - } - } - cur_line_num++; - } - mask_stream.close(); - - if (train_is_on) { - global_training_count_ = valid_count; - } - - if (valid_count != mask_range->size) { - // overlapping masks: need to actually check the masks rather than use - // ranges - if (!incomplete_masks_) { - galois::gInfo( - "Masks are not contained in range: must actually check mask"); - } - incomplete_masks_ = true; - } - - return valid_count; -} - -size_t galois::graphs::GNNGraph::FindOtherMask() { - galois::GAccumulator other_accum; - other_accum.reset(); - other_mask_.resize(partitioned_graph_->size()); - - galois::do_all( - galois::iterate(size_t{0}, partitioned_graph_->size()), - [&](size_t local_id) { - if (!IsValidForPhase(local_id, GNNPhase::kTrain) && - !IsValidForPhase(local_id, GNNPhase::kValidate) && - !IsValidForPhase(local_id, GNNPhase::kTest)) { - other_mask_[local_id] = 1; - other_accum += 1; - } - }, - galois::loopname("FindOtherMask")); - return other_accum.reduce(); -} - -void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { - // allocate the memory for the local masks - global_training_mask_.resize(partitioned_graph_->globalSize()); - local_training_mask_.resize(partitioned_graph_->size()); - local_validation_mask_.resize(partitioned_graph_->size()); - local_testing_mask_.resize(partitioned_graph_->size()); - - if (dataset_name == "reddit") { - global_training_count_ = 153431; - - // TODO reddit is hardcode handled at the moment; better way to not do - // this? 
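The mask reader above expects one text file per split: a header line giving the split's global-ID range, followed by one 0/1 line per global vertex. Below is a simplified single-host sketch of consuming that format; the file name, graph size, and direct global-ID indexing are illustrative assumptions (the real code additionally maps each global ID to a local ID on the owning host and tracks the global training mask separately).

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

size_t ReadMask(const std::string& path, size_t num_global,
                std::vector<char>* mask /* indexed by global id here */) {
  std::ifstream in(path);
  size_t begin = 0, end = 0;
  in >> begin >> end >> std::ws;      // header: global-ID range of this split
  mask->assign(num_global, 0);
  std::string line;
  size_t gid = 0, valid = 0;
  while (std::getline(in, line)) {
    if (gid >= begin && gid < end) {  // only ids inside the range may be set
      std::istringstream ss(line);
      unsigned bit = 0;
      ss >> bit;
      if (bit == 1) {
        (*mask)[gid] = 1;             // real code converts gid -> local id first
        ++valid;
      }
    }
    ++gid;
  }
  return valid;                       // total valid nodes in this split
}

int main() {
  std::vector<char> train_mask;
  // hypothetical dataset: file name and vertex count are placeholders
  size_t n = ReadMask("cora-train_mask.txt", 2708, &train_mask);
  std::cout << "valid train nodes: " << n << "\n";
}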
- global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; - global_validation_mask_range_ = { - .begin = 153431, .end = 153431 + 23831, .size = 23831}; - global_testing_mask_range_ = { - .begin = 177262, .end = 177262 + 55703, .size = 55703}; - - // training - for (size_t i = global_training_mask_range_.begin; - i < global_training_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_training_mask_[partitioned_graph_->getLID(i)] = 1; - } - global_training_mask_[i] = 1; - } - - // validation - for (size_t i = global_validation_mask_range_.begin; - i < global_validation_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_validation_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - - // testing - for (size_t i = global_testing_mask_range_.begin; - i < global_testing_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_testing_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - } else if (dataset_name == "ogbn-papers100M-remap") { - global_training_count_ = 1207178; - - global_training_mask_range_ = {.begin = 0, .end = 1207178, .size = 1207178}; - global_validation_mask_range_ = { - .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; - global_testing_mask_range_ = { - .begin = 1332442, .end = 1332442 + 214337, .size = 214337}; - // training - for (size_t i = global_training_mask_range_.begin; - i < global_training_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_training_mask_[partitioned_graph_->getLID(i)] = 1; - } - global_training_mask_[i] = 1; - } - // validation - for (size_t i = global_validation_mask_range_.begin; - i < global_validation_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_validation_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - // testing - for (size_t i = global_testing_mask_range_.begin; - i < global_testing_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_testing_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - valid_other_ = FindOtherMask(); - GALOIS_LOG_ASSERT(valid_other_ <= 109513177); - } else { - size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", - &global_training_mask_range_, - &local_training_mask_); - size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", - &global_validation_mask_range_, - &local_validation_mask_); - size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", - &global_testing_mask_range_, - &local_testing_mask_); - valid_other_ = FindOtherMask(); - // the "other" set of nodes that don't fall into any classification - if (galois::runtime::getSystemNetworkInterface().ID == 0) { - galois::gInfo("Valid # training nodes is ", valid_train); - galois::gInfo("Valid # validation nodes is ", valid_val); - galois::gInfo("Valid # test nodes is ", valid_test); - galois::gInfo("Valid # other nodes is ", valid_other_); - } - } -} - -void galois::graphs::GNNGraph::InitNormFactor() { - GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); - global_degrees_.resize(partitioned_graph_->size(), 0.0); - global_train_degrees_.resize(partitioned_graph_->size(), 0.0); - CalculateFullNormFactor(); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_memory_.InitNormFactor(partitioned_graph_->size()); - } -#endif -} - -void galois::graphs::GNNGraph::CalculateFullNormFactor() { - // TODO(loc) reset all degrees if this is called multiple times? 
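CalculateFullNormFactor, continuing below, only needs per-node out-degree counts: one over all destinations and one restricted to train/"other" destinations, with the aggregation norm later taken as 1/degree. A toy serial version over a hypothetical CSR is sketched here; it omits the galois::do_all parallelism and the Gluon degree synchronization that the real code performs afterwards.

#include <cstdio>
#include <vector>

int main() {
  // tiny CSR: node i's outgoing edges are dsts[row[i] .. row[i+1])
  std::vector<size_t> row = {0, 2, 3, 4};
  std::vector<unsigned> dsts = {1, 2, 2, 0};
  std::vector<bool> is_train = {true, false, true};  // per-node train mask

  std::vector<unsigned> degree(3, 0), train_degree(3, 0);
  for (unsigned src = 0; src < 3; src++) {
    for (size_t e = row[src]; e < row[src + 1]; e++) {
      degree[src]++;
      if (is_train[dsts[e]]) {
        train_degree[src]++;  // only count train destinations
      }
    }
    // the norm factor later used during aggregation is simply 1 / degree
    std::printf("node %u: degree %u, train degree %u\n", src, degree[src],
                train_degree[src]);
  }
  return 0;
}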
- // get the norm factor contribution for each node based on the GLOBAL graph - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t src) { - for (auto edge_iter = partitioned_graph_->edge_begin(src); - edge_iter != partitioned_graph_->edge_end(src); edge_iter++) { - // count degrees for all + train/other - size_t dest = GetEdgeDest(edge_iter); - if (IsValidForPhase(dest, GNNPhase::kTrain) || - IsValidForPhase(dest, GNNPhase::kOther)) { - global_train_degrees_[src] += 1; - } - global_degrees_[src] += 1; - } - }, - galois::loopname("CalculateLocalDegrees")); - // degree sync - gnn_degree_vec_1_ = global_train_degrees_.data(); - gnn_degree_vec_2_ = global_degrees_.data(); - sync_substrate_->sync( - "InitialDegreeSync"); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracy( - PointerWithSize predictions, GNNPhase phase) { - // No GPU version yet, but this is where it would be - return GetGlobalAccuracy(predictions, phase, false); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracy( - PointerWithSize predictions, GNNPhase phase, bool sampling) { - // No GPU version yet, but this is where it would be - return GetGlobalAccuracyCPU(predictions, phase, sampling); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( - PointerWithSize predictions, GNNPhase phase, bool sampling) { - galois::StatTimer global_accuracy_timer("GetGlobalAccuracy"); - galois::StatTimer global_accuracy_for_singleclass_timer("GetGlobalAccuracyForSingleClass"); - galois::StatTimer global_accuracy_for_multiclass_timer("GetGlobalAccuracyForMultiClass"); - global_accuracy_timer.start(); - float accuracy{0}; - if (is_single_class_label()) { - global_accuracy_for_singleclass_timer.start(); - accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); - global_accuracy_for_singleclass_timer.stop(); - } else { - global_accuracy_for_multiclass_timer.start(); - accuracy = GetGlobalAccuracyCPUMulti(predictions, phase, sampling); - global_accuracy_for_multiclass_timer.stop(); - } - global_accuracy_timer.stop(); - return accuracy; -} - -float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( - PointerWithSize predictions, GNNPhase phase, bool) { - // check owned nodes' accuracy - num_correct_.reset(); - total_checked_.reset(); - - galois::do_all( - // will only loop over sampled nodes if sampling is on - galois::iterate(begin_owned(), end_owned()), - // this is possibly the subgraph id - [&](const unsigned node_id) { - if (IsValidForPhase(node_id, phase)) { - total_checked_ += 1; - // get prediction by getting max - // note the use of node_id here: lid only used to check original - // labels - size_t predicted_label = galois::MaxIndex( - num_label_classes_, &(predictions[node_id * num_label_classes_])); - // check against ground truth and track accordingly - // TODO static cast used here is dangerous - if (predicted_label == - static_cast(GetSingleClassLabel(node_id))) { - num_correct_ += 1; - } - } - }, - // steal on as some threads may have nothing to work on - galois::steal()); - - size_t global_correct = num_correct_.reduce(); - size_t global_checked = total_checked_.reduce(); - - GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, - global_checked); - - return static_cast(global_correct) / - static_cast(global_checked); -} -std::pair galois::graphs::GNNGraph::GetBatchAccuracy( - PointerWithSize predictions) { - // check owned nodes' accuracy - num_correct_.reset(); - total_checked_.reset(); - - galois::do_all( - // will only loop over sampled 
nodes if sampling is on - galois::iterate(begin_owned(), end_owned()), - // this is possibly the subgraph id - [&](const unsigned node_id) { - if (IsValidForPhase(node_id, GNNPhase::kBatch)) { - total_checked_ += 1; - size_t predicted_label = galois::MaxIndex( - num_label_classes_, &(predictions[node_id * num_label_classes_])); - if (predicted_label == - static_cast(GetSingleClassLabel(node_id))) { - num_correct_ += 1; - } - } - }, - // steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("GlobalAccuracy")); - - size_t global_correct = num_correct_.reduce(); - size_t global_checked = total_checked_.reduce(); - - return std::make_pair(global_correct, global_checked); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( - PointerWithSize predictions, GNNPhase phase, bool sampling) { - - const GNNLabel* full_ground_truth = GetMultiClassLabel(0); - assert(predictions.size() == (num_label_classes_ * size())); - - size_t global_true_positive = 0; - size_t global_true_negative = 0; - size_t global_false_positive = 0; - size_t global_false_negative = 0; - size_t global_f1_score = 0; - - // per class check - for (size_t label_class = 0; label_class < num_label_classes_; - label_class++) { - local_true_positive_.reset(); - local_true_negative_.reset(); - local_false_positive_.reset(); - local_false_negative_.reset(); - - // loop through all *owned* nodes (do not want to overcount) - galois::do_all( - galois::iterate(begin_owned(), end_owned()), - [&](const unsigned lid) { - if (IsValidForPhase(lid, phase)) { - if (sampling) { - if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { - return; - } - } - - size_t label_index = lid * num_label_classes_ + label_class; - - GNNLabel true_label = full_ground_truth[label_index]; - GNNLabel prediction_is_positive = - (predictions[label_index] > 0.5) ? 1 : 0; - - if (true_label && prediction_is_positive) { - local_true_positive_ += 1; - } else if (true_label && !prediction_is_positive) { - local_false_negative_ += 1; - } else if (!true_label && prediction_is_positive) { - local_false_positive_ += 1; - } else if (!true_label && !prediction_is_positive) { - local_true_negative_ += 1; - } else { - // all cases should be covered with clauses above, so it should - // NEVER get here; adding it here just for sanity purposes - GALOIS_LOG_FATAL( - "Logic error with true label and prediction label"); - } - } - total_checked_ += 1; - }, - galois::steal(), galois::loopname("GlobalMultiAccuracy")); - - // reduce from accumulators across all hosts for this particular class - size_t class_true_positives = local_true_positive_.reduce(); - size_t class_false_positives = local_false_positive_.reduce(); - size_t class_true_negatives = local_true_negative_.reduce(); - size_t class_false_negatives = local_false_negative_.reduce(); - - // add to global counts - global_true_positive += class_true_positives; - global_false_positive += class_false_positives; - global_true_negative += class_true_negatives; - global_false_negative += class_false_negatives; - - // calculate precision, recall, and f1 score for this class - // ternery op used to avoid division by 0 - double class_precision = - (class_true_positives + class_true_negatives) > 0 - ? static_cast(class_true_positives) / - (class_true_positives + class_false_positives) - : 0.0; - double class_recall = - (class_true_positives + class_false_negatives) > 0 - ? 
static_cast(class_true_positives) / - (class_true_positives + class_false_negatives) - : 0.0; - double class_f1_score = (class_precision + class_recall) > 0 - ? (2.0 * (class_precision * class_recall)) / - (class_precision + class_recall) - : 0.0; - - global_f1_score += class_f1_score; - } // end label class loop - - // GALOIS_LOG_WARN("{} {} {} {}", global_true_positive, global_true_negative, - // global_false_positive, global_false_negative); - - // double global_f1_macro_score = global_f1_score / num_label_classes_; - - // micro = considers all classes for precision/recall - double global_micro_precision = - (global_true_positive + global_true_negative) > 0 - ? static_cast(global_true_positive) / - (global_true_positive + global_false_positive) - : 0.0; - double global_micro_recall = - (global_true_positive + global_false_negative) > 0 - ? static_cast(global_true_positive) / - (global_true_positive + global_false_negative) - : 0.0; - - double global_f1_micro_score = - (global_micro_precision + global_micro_recall) > 0 - ? (2.0 * (global_micro_precision * global_micro_recall)) / - (global_micro_precision + global_micro_recall) - : 0.0; - - return global_f1_micro_score; -} - -//////////////////////////////////////////////////////////////////////////////// - -void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, - bool choose_all) { - subgraph_ = std::make_unique(partitioned_graph_->size()); - sample_node_timestamps_.create(partitioned_graph_->size(), - std::numeric_limits::max()); - edge_sample_status_.resize(num_layers); - for (size_t i = 0; i < num_layers; i++) { - edge_sample_status_[i].resize(partitioned_graph_->sizeEdges()); - } - sampled_edges_.resize(partitioned_graph_->sizeEdges()); - // this is to hold the degree of a sampled graph considering all hosts; yes, - // memory wise this is slightly problematic possibly, but each layer is its - // own subgraph - if (!choose_all) { - sampled_out_degrees_.resize(num_layers); - for (galois::LargeArray& array : sampled_out_degrees_) { - array.create(partitioned_graph_->size()); - } - } else { - subgraph_choose_all_ = true; - } - definitely_sampled_nodes_.resize(partitioned_graph_->size()); - master_offset_accum_.resize(num_layers + 1); - mirror_offset_accum_.resize(num_layers + 1); - sample_master_offsets_.resize(num_layers + 1, 0); - sample_mirror_offsets_.resize(num_layers + 1, 0); -} - -size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { - DisableSubgraph(); - - if (!bitset_sample_flag_.size()) { - bitset_sample_flag_.resize(size()); - } - bitset_sample_flag_.ParallelReset(); - definitely_sampled_nodes_.ParallelReset(); - - galois::do_all( - galois::iterate(begin_owned(), end_owned()), - [&](const NodeIterator& x) { - if (IsValidForPhase(*x, seed_phase)) { - SetSampledNode(*x); - bitset_sample_flag_.set(*x); - definitely_sampled_nodes_.set(*x); - } else { - UnsetSampledNode(*x); - } - }, - galois::loopname("InitialSeedSetting")); - // unsets nodes set in previous iterations; for some reason they get - // synchronized along with everything else even though bitset sample flag - // should prevent it (that, or it's because they don't get sync'd that they - // remain the same) - galois::do_all(galois::iterate(end_owned(), end()), - [&](const NodeIterator& x) { UnsetSampledNode(*x); }); - - // clear node timestamps - galois::StatTimer fill_time("ClearFillTime"); - fill_time.start(); - galois::ParallelSTL::fill(sample_node_timestamps_.begin(), - sample_node_timestamps_.end(), - 
std::numeric_limits::max()); - galois::ParallelSTL::fill(sample_master_offsets_.begin(), - sample_master_offsets_.end(), 0); - galois::ParallelSTL::fill(sample_mirror_offsets_.begin(), - sample_mirror_offsets_.end(), 0); - fill_time.stop(); - - for (unsigned i = 0; i < master_offset_accum_.size(); i++) { - master_offset_accum_[i].reset(); - mirror_offset_accum_[i].reset(); - } - - // clear all sampled edges - galois::StatTimer ctime("ClearSampleEdges"); - ctime.start(); - for (galois::DynamicBitSet& edge_layer : edge_sample_status_) { - edge_layer.ParallelReset(); - } - ctime.stop(); - // galois::do_all( - // galois::iterate(edge_sample_status_.begin(), - // edge_sample_status_.end()), - // [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, - // galois::loopname("ClearSampleEdges")); - - sampled_edges_.ParallelReset(); - - // reset all degrees - if (!subgraph_choose_all_) { - galois::StatTimer cad_timer("ClearAllDegrees"); - cad_timer.start(); - for (galois::LargeArray& array : sampled_out_degrees_) { - galois::ParallelSTL::fill(array.begin(), array.end(), 0); - } - cad_timer.stop(); - } - - if (!bitset_sampled_degrees_.size()) { - bitset_sampled_degrees_.resize(partitioned_graph_->size()); - } - bitset_sampled_degrees_.reset(); - - // Seed nodes sync - if (use_timer_) { - sync_substrate_ - ->sync( - "SeedNodeSample"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - galois::GAccumulator local_seed_count; - local_seed_count.reset(); - galois::GAccumulator master_offset; - master_offset.reset(); - galois::GAccumulator mirror_offset; - mirror_offset.reset(); - // count # of seed nodes - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } - - // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); - local_seed_count += 1; - // 0 = seed node - sample_node_timestamps_[*x] = 0; - } - }, - galois::loopname("SeedNodeOffsetCounting")); - - sample_master_offsets_[0] = master_offset.reduce(); - sample_mirror_offsets_[0] = mirror_offset.reduce(); - - return local_seed_count.reduce(); -} - -size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, - bool inductive_subgraph, - size_t timestamp) { - DisableSubgraph(); - - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& src_iter) { - // only operate on if sampled - if (IsInSampledGraph(src_iter)) { - // marks ALL edges of nodes that connect to train/other nodes - for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - // total += 1; - if (inductive_subgraph) { - if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) && - !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - continue; - } - } - - MakeEdgeSampled(edge_iter, agg_layer_num); - uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); - if (!IsInSampledGraph(dest)) { - bitset_sample_flag_.set(dest); - } - definitely_sampled_nodes_.set(*src_iter); - definitely_sampled_nodes_.set(dest); - } - } - }, - galois::steal(), galois::loopname("ChooseAllEdges")); - - // update nodes, then communicate update to all hosts so that they can - // continue the exploration - galois::do_all( - galois::iterate(size_t{0}, bitset_sample_flag_.size()), - [&](uint32_t new_node_id) { - if (bitset_sample_flag_.test(new_node_id)) { - SetSampledNode(new_node_id); - } - }, - galois::loopname("NeighborhoodSampleSet")); - - if 
(use_timer_) { - sync_substrate_ - ->sync( - "SampleFlag"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - galois::GAccumulator local_sample_count; - local_sample_count.reset(); - // count # of seed nodes - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - local_sample_count += 1; - if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (x < end_owned()) { - // owned nodes that are activated on other hosts shoudl always - // be activated because it's responsible for keeping others in - // sync during comms; ignoring it = bad - // TODO(gluon) make it so you don't have to deal with this - // and just use host as a reducer point - definitely_sampled_nodes_.set(*x); - } - sample_node_timestamps_[*x] = timestamp; - } - } - }); - - EnableSubgraphChooseAll(); - return local_sample_count.reduce(); -} - -size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, - size_t num_to_sample, - bool inductive_subgraph, - size_t timestamp) { - use_subgraph_ = false; - use_subgraph_view_ = false; - - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& src_iter) { - // only operate on if sampled - if (IsInSampledGraph(src_iter)) { - // chance of not uniformly choosing an edge of this node num_to_sample - // times (degree norm is 1 / degree) - double probability_of_reject; - if (!inductive_subgraph) { - probability_of_reject = - std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); - } else { - probability_of_reject = std::pow( - 1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); - } - - // loop through edges, turn "on" edge with some probability - for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - if (sample_rng_.DoBernoulli(probability_of_reject)) { - if (inductive_subgraph) { - // only take if node is training node or a node not classified - // into train/test/val - if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) && - !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - continue; - } - } - - uint32_t edge_dst = partitioned_graph_->getEdgeDst(edge_iter); - // if here, it means edge accepted; set sampled on, mark - // as part of next set - MakeEdgeSampled(edge_iter, sample_layer_num); - if (!IsInSampledGraph(edge_dst)) { - bitset_sample_flag_.set(edge_dst); - } - bitset_sampled_degrees_.set(*src_iter); - definitely_sampled_nodes_.set(*src_iter); - definitely_sampled_nodes_.set(edge_dst); - // degree increment - sampled_out_degrees_[sample_layer_num][*src_iter]++; - } - } - } - }, - galois::steal(), galois::loopname("NeighborhoodSample")); - - // update nodes, then communicate update to all hosts so that they can - // continue the exploration - galois::do_all( - galois::iterate(size_t{0}, bitset_sample_flag_.size()), - [&](uint32_t new_node_id) { - if (bitset_sample_flag_.test(new_node_id)) { - SetSampledNode(new_node_id); - } - }, - galois::loopname("NeighborhoodSampleSet")); - - // why not read source? 
even if it doesn't need to sample anything, it needs - // to know that it's active so that subgraph construction can proceed - // correctly - if (use_timer_) { - sync_substrate_ - ->sync( - "SampleFlag"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - // count sampled node size - galois::GAccumulator local_sample_count; - local_sample_count.reset(); - // count # of seed nodes - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - local_sample_count += 1; - if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (x < end_owned()) { - // owned nodes that are activated on other hosts shoudl always - // be activated because it's responsible for keeping others in - // sync during comms; ignoring it = bad - // TODO(gluon) make it so you don't have to deal with this - // and just use host as a reducer point - definitely_sampled_nodes_.set(*x); - } - sample_node_timestamps_[*x] = timestamp; - } - } - }); - - DisableSubgraphChooseAll(); - return local_sample_count.reduce(); -} - -//! Construct the subgraph from sampled edges and corresponding nodes -std::vector -galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, - bool use_view) { - // false first so that the build process can use functions to access the - // real graph - DisableSubgraph(); - - gnn_sampled_out_degrees_ = &sampled_out_degrees_; - - // first, sync the degres of the sampled edges across all hosts - // read any because destinations need it to for reverse phase - if (use_timer_) { - sync_substrate_ - ->sync( - "SubgraphDegree"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); - offsets_n_rows_time.start(); - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& x) { - if (IsActiveInSubgraph(*x)) { - if (sample_node_timestamps_[*x] != - std::numeric_limits::max()) { - if (*x < *end_owned()) { - // master - master_offset_accum_[sample_node_timestamps_[*x]] += 1; - } else { - // mirror - mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; - } - } else { - GALOIS_LOG_FATAL( - "should have been timestamped at some point if active"); - } - } - }, - galois::loopname("MasterMirrorOffset")); - - std::vector new_rows(master_offset_accum_.size()); - for (unsigned i = 0; i < master_offset_accum_.size(); i++) { - sample_master_offsets_[i] = master_offset_accum_[i].reduce(); - sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); - new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; - if (i > 0) { - new_rows[i] += new_rows[i - 1]; - } - } - - offsets_n_rows_time.stop(); - - if (!use_view) { - subgraph_->BuildSubgraph(*this, num_sampled_layers); - } else { - // a view only has lid<->sid mappings - subgraph_->BuildSubgraphView(*this, num_sampled_layers); - } - - sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors(), - use_timer_); - - // after this, this graph is a subgraph - if (!use_view) { - use_subgraph_ = true; - } else { - use_subgraph_view_ = true; - } - - return new_rows; -} - -size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { - train_batcher_->GetNextMinibatch(&local_minibatch_mask_); -#ifndef NDEBUG - size_t count = 0; - // galois::gPrint("Minibatch : "); - for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { - if (local_minibatch_mask_[i]) { - // galois::gPrint(partitioned_graph_->getGID(i), ","); - count++; - } - } - // galois::gPrint("\n"); - 
galois::gInfo(host_prefix(), "Batched nodes ", count); -#endif - return SetupNeighborhoodSample(GNNPhase::kBatch); -} - -size_t galois::graphs::GNNGraph::PrepareNextTestMinibatch() { - test_batcher_->GetNextMinibatch(&local_minibatch_mask_); - return SetupNeighborhoodSample(GNNPhase::kBatch); -} - -//////////////////////////////////////////////////////////////////////////////// - -#ifdef GALOIS_ENABLE_GPU -void galois::graphs::GNNGraph::InitGPUMemory() { - // create int casted CSR - uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); - uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); - - // + 1 because first element is 0 in BLAS CSRs - std::vector e_index(partitioned_graph_->size() + 1); - std::vector e_dest(partitioned_graph_->sizeEdges()); - - // set in parallel - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size() + 1), - [&](size_t index) { - if (index != 0) { - if (e_index_ptr[index - 1] > - static_cast(std::numeric_limits::max())) { - GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", - e_index_ptr[index - 1]); - } - e_index[index] = static_cast(e_index_ptr[index - 1]); - } else { - e_index[index] = 0; - } - }, - galois::loopname("GPUEdgeIndexConstruction")); - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->sizeEdges()), - [&](size_t edge) { - if (e_dest_ptr[edge] > - static_cast(std::numeric_limits::max())) { - GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", - e_dest_ptr[edge]); - } - - e_dest[edge] = static_cast(e_dest_ptr[edge]); - }, - galois::loopname("GPUEdgeDestConstruction")); - - gpu_memory_.SetGraphTopology(e_index, e_dest); - e_index.clear(); - e_dest.clear(); - - gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); - gpu_memory_.SetLabels(local_ground_truth_labels_); - gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, - local_testing_mask_); - gpu_memory_.AllocAggregateBitset(partitioned_graph_->size()); - gpu_memory_.SetGlobalTrainDegrees(global_train_degrees_); - gpu_memory_.SetGlobalDegrees(global_degrees_); -} - -void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( - size_t layer_number, unsigned num_hosts, size_t infl_in_size, - size_t infl_out_size) { - init_CUDA_layer_vector_meta_obj(cuda_ctx_, layer_number, num_hosts, size(), - infl_in_size, infl_out_size); -} - -void galois::graphs::GNNGraph::ResizeGPULayerVector(size_t num_layers) { - resize_CUDA_layer_vector(cuda_ctx_, num_layers); -} -#endif -void galois::graphs::GNNGraph::ContiguousRemap(const std::string& new_name) { - node_remapping_.resize(partitioned_graph_->size()); - - uint32_t new_node_id = 0; - - // serial loops because new ID needs to be kept consistent - // first, train nodes - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kTrain)) { - node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Train nodes are from 0 to ", new_node_id); - - // second, val nodes - uint32_t val_start = new_node_id; - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kValidate)) { - node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Val nodes are from ", val_start, " to ", new_node_id, "(", - new_node_id - val_start, ")"); - - // third, test nodes - uint32_t test_start = new_node_id; - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kTest)) { 
- node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Test nodes are from ", test_start, " to ", new_node_id, "(", - new_node_id - test_start, ")"); - - // last, everything else - uint32_t other_start = new_node_id; - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kOther)) { - node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Other nodes are from ", other_start, " to ", new_node_id, "(", - new_node_id - other_start, ")"); - GALOIS_LOG_ASSERT(new_node_id == partitioned_graph_->size()); - - // remap features to match new node mapping, save to disk - // std::vector remapped_features(local_node_features_.size()); - //// do all works because can copy in parallel - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // std::memcpy( - // &(remapped_features[remap_node_id * node_feature_length_]), - // &((local_node_features_.data())[node_remapping_[remap_node_id] * - // node_feature_length_]), node_feature_length_ * sizeof(GNNFeature)); - // } - //); - //// sanity check - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // for (size_t i = 0; i < node_feature_length_; i++) { - // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * - // node_feature_length_ + i] == - // local_node_features_[node_remapping_[remap_node_id] - // * node_feature_length_ + i]); - // } - // } - //); - //// save to disk - // std::ofstream write_file_stream; - // std::string feature_file = input_directory_ + new_name + "-feats.bin"; - // galois::gPrint(feature_file, "\n"); - // write_file_stream.open(feature_file, std::ios::binary | std::ios::out); - // write_file_stream.write((char*)remapped_features.data(), sizeof(GNNFeature) - // * - // partitioned_graph_->size() - // * node_feature_length_); - // write_file_stream.close(); - - // std::ifstream file_stream; - // file_stream.open(feature_file, std::ios::binary | std::ios::in); - // file_stream.read((char*)remapped_features.data(), sizeof(GNNFloat) * - // partitioned_graph_->size() - // * node_feature_length_); - // file_stream.close(); - //// sanity check again - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // for (size_t i = 0; i < node_feature_length_; i++) { - // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * - // node_feature_length_ + i] == - // local_node_features_[node_remapping_[remap_node_id] - // * node_feature_length_ + i]); - // } - // } - //); - // remapped_features.clear(); - - // std::vector remapped_labels(local_ground_truth_labels_.size()); - //// save new labels order to disk (binary file) - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // remapped_labels[remap_node_id] = - // local_ground_truth_labels_[node_remapping_[remap_node_id]]; - // } - //); - - // std::string label_filename = input_directory_ + new_name + "-labels.bin"; - // std::ofstream label_write_stream; - // label_write_stream.open(label_filename, std::ios::binary | std::ios::out); - // label_write_stream.write((char*)remapped_labels.data(), sizeof(GNNLabel) * - // partitioned_graph_->size()); - // label_write_stream.close(); - - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // remapped_labels[remap_node_id] = - // 
local_ground_truth_labels_[remap_node_id]; - // } - //); - // ReadLocalLabelsBin(new_name); - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // GALOIS_LOG_ASSERT(local_ground_truth_labels_[remap_node_id] == - // remapped_labels[node_remapping_[remap_node_id]]); - // } - //); - // save the mapping to a binary file for use by graph convert to deal with - // the gr - std::string label_filename = input_directory_ + new_name + "-mapping.bin"; - std::ofstream label_write_stream; - label_write_stream.open(label_filename, std::ios::binary | std::ios::out); - label_write_stream.write((char*)node_remapping_.data(), - sizeof(uint32_t) * node_remapping_.size()); - label_write_stream.close(); -} +}; // namespace graphs +}; // namespace galois diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index f2148b2706..3bea1063c8 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -1,441 +1 @@ #include "galois/graphs/GNNGraph.h" -#include - -size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( - GNNGraph& gnn_graph, size_t num_sampled_layers) { - galois::StatTimer timer("BuildSubgraph", kRegionName); - TimerStart(&timer); - for (auto& vec : subgraph_mirrors_) { - vec.clear(); - } - CreateSubgraphMapping(gnn_graph, num_sampled_layers); - if (num_subgraph_nodes_ == 0) { - return 0; - } - DegreeCounting(gnn_graph); - EdgeCreation(gnn_graph); - NodeFeatureCreation(gnn_graph); - // loop over each node, grab out/in edges, construct them in LC_CSR_CSC - // no edge data, just topology - TimerStop(&timer); - return num_subgraph_nodes_; -} - -size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( - GNNGraph& gnn_graph, size_t num_sampled_layers) { - galois::StatTimer timer("BuildSubgraphView", kRegionName); - TimerStart(&timer); - CreateSubgraphMapping(gnn_graph, num_sampled_layers); - NodeFeatureCreation(gnn_graph); - TimerStop(&timer); - return num_subgraph_nodes_; -} - -// TODO signature cleanup -void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( - GNNGraph& gnn_graph, size_t) { - galois::StatTimer timer("SIDMapping", kRegionName); - TimerStart(&timer); - - assert(gnn_graph.size() == lid_to_subgraph_id_.size()); - // clear all mappings - galois::ParallelSTL::fill(lid_to_subgraph_id_.begin(), - lid_to_subgraph_id_.end(), - std::numeric_limits::max()); - - galois::GAccumulator subgraph_count; - subgraph_count.reset(); - galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), - [&](uint32_t node_id) { - if (gnn_graph.IsActiveInSubgraph(node_id)) { - subgraph_count += 1; - } - }); - num_subgraph_nodes_ = subgraph_count.reduce(); - // if no subgraph, get out - if (num_subgraph_nodes_ == 0) { - subgraph_master_boundary_ = 0; - TimerStop(&timer); - return; - } - - // checking sanity - // galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), - // [&](uint32_t node_id) { - // if (gnn_graph.IsInSampledGraph(node_id) && - // !gnn_graph.IsActiveInSubgraph(node_id)) { - // // check if any edges are active - // for (auto a = gnn_graph.edge_begin(node_id); a != - // gnn_graph.edge_end(node_id);a++) { - // if (gnn_graph.IsEdgeSampledAny(a)) { - // galois::gWarn("ERROR node ", node_id); - // } - // } - // for (auto a = gnn_graph.in_edge_begin(node_id); a != - // gnn_graph.in_edge_end(node_id);a++) { - // if (gnn_graph.IsInEdgeSampledAny(a)) { - // galois::gWarn("ERROR in node ", node_id); - // } - // } - // } - // }); - - if 
(subgraph_id_to_lid_.size() < num_subgraph_nodes_) { - // allocate a bit more than necessary to avoid a big realloc - // if node value changes slightly later - subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); - } - - // bitset to mark if a master is outside the "master only" boundary - // and not contiguous; needed to mask out non-masters - galois::DynamicBitSet& non_layer_zero_masters = - gnn_graph.GetNonLayerZeroMasters(); - // init the bitset as necessary - if (non_layer_zero_masters.size() < num_subgraph_nodes_) { - non_layer_zero_masters.resize(num_subgraph_nodes_); - } else { - non_layer_zero_masters.ParallelReset(); - } - - std::vector& master_offsets = gnn_graph.GetMasterOffsets(); - std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - - ResetSIDThreadOffsets(master_offsets.size()); - - // compute offsets for each layer - galois::PODResizeableArray layer_offsets; - layer_offsets.resize(master_offsets.size() - 1); - for (unsigned i = 0; i < layer_offsets.size(); i++) { - layer_offsets[i] = master_offsets[i] + mirror_offsets[i]; - if (i > 0) { - // prefix summing - layer_offsets[i] += layer_offsets[i - 1]; - } - } - - // all nodes before this SID are master nodes in layer 0; - // NOTE: there are master nodes past this boundary that will - // not be covered by a begin_owned loop, which may cause problems down - // the line; this is handled by the bitset above - subgraph_master_boundary_ = master_offsets[0]; - - size_t last_owned_node = *(gnn_graph.end_owned()); - // compute amount of work each thread needs to do - galois::on_each([&](size_t thread_id, size_t num_threads) { - unsigned start_node; - unsigned end_node; - // this thread always has a set number of nodes to run; this is it - std::tie(start_node, end_node) = galois::block_range( - size_t{0}, gnn_graph.size(), thread_id, num_threads); - // these arrays track how much work will need to be done by this - // thread - galois::PODResizeableArray& my_offsets = - sid_thread_offsets_[thread_id]; - galois::PODResizeableArray& my_mirror_offsets = - subgraph_mirror_offsets_[thread_id]; - - for (size_t local_node_id = start_node; local_node_id < end_node; - local_node_id++) { - // only bother if node was active - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - // TODO(loc) this check shouldn't even be necessary; active in subgraph - // implies added at somepoint - if (node_timestamp != std::numeric_limits::max()) { - // tracks how many nodes for each timestamp this node will - // work with by incrementing this - my_offsets[node_timestamp]++; - - if (local_node_id >= last_owned_node) { - // this is a mirror node; get the host that the master is located - // on and increment this thread's mirror node count for that host - uint32_t node_gid = gnn_graph.GetGID(local_node_id); - my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; - } - } else { - GALOIS_LOG_WARN("shouldn't ever get here right?"); - } - } - } - }); - - // prefix sum the threads - galois::do_all(galois::iterate(size_t{0}, master_offsets.size()), - [&](size_t layer_num) { - for (size_t thread_id = 1; - thread_id < galois::getActiveThreads(); thread_id++) { - sid_thread_offsets_[thread_id][layer_num] += - sid_thread_offsets_[thread_id - 1][layer_num]; - } - }); - - for (unsigned i = 0; i < master_offsets.size() - 1; i++) { - if (i > 0) { - GALOIS_LOG_VASSERT( - sid_thread_offsets_[galois::getActiveThreads() - 1][i] + - layer_offsets[i - 1] == - (layer_offsets[i]), - "layer {} wrong {} 
vs correct {}", i, - sid_thread_offsets_[galois::getActiveThreads() - 1][i], - layer_offsets[i]); - } else { - GALOIS_LOG_VASSERT( - sid_thread_offsets_[galois::getActiveThreads() - 1][i] == - (layer_offsets[i]), - "layer {} wrong {} vs correct {}", i, - sid_thread_offsets_[galois::getActiveThreads() - 1][i], - layer_offsets[i]); - } - } - - // last element of prefix sum needs to equal the correct layer offset - galois::do_all( - galois::iterate(uint32_t{0}, - galois::runtime::getSystemNetworkInterface().Num), - [&](size_t host_num) { - // for each host, get prefix sum of each thread's mirrors - for (size_t thread_id = 1; thread_id < galois::getActiveThreads(); - thread_id++) { - subgraph_mirror_offsets_[thread_id][host_num] += - subgraph_mirror_offsets_[thread_id - 1][host_num]; - } - }); - - // allocate the mirror space; last element of prefix sum is total size - for (unsigned host_num = 0; - host_num < galois::runtime::getSystemNetworkInterface().Num; - host_num++) { - if (galois::runtime::getSystemNetworkInterface().ID == host_num) { - continue; - } - subgraph_mirrors_[host_num].resize( - subgraph_mirror_offsets_[galois::getActiveThreads() - 1][host_num]); - } - - galois::on_each([&](size_t thread_id, size_t num_threads) { - unsigned start_node; - unsigned end_node; - std::tie(start_node, end_node) = galois::block_range( - size_t{0}, gnn_graph.size(), thread_id, num_threads); - - galois::PODResizeableArray& current_thread_offset = - thread_id != 0 ? sid_thread_offsets_[thread_id - 1] : thread_zero_work_; - galois::PODResizeableArray& my_mirror_offsets = - thread_id != 0 ? subgraph_mirror_offsets_[thread_id - 1] - : thread_zero_mirror_offsets_; - - for (size_t local_node_id = start_node; local_node_id < end_node; - local_node_id++) { - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1] + - current_thread_offset[node_timestamp]++; - if (local_node_id < last_owned_node) { - // master node that is not in layer 0 (i.e. node_timestamp != 0) - non_layer_zero_masters.set(sid_to_use); - } - } else { - // node timestamp == 0; no layer offset needed because offset - // is 0 - sid_to_use = current_thread_offset[node_timestamp]++; - } - - // this is a mirror - if (local_node_id >= last_owned_node) { - // XXX(loc) mirror offsets - uint32_t node_gid = gnn_graph.GetGID(local_node_id); - size_t my_offset = - my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; - - if (my_offset > - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()) - GALOIS_LOG_FATAL( - "{} {}", my_offset, - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()); - - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)][my_offset] = - node_gid; - } - - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use; - } else { - GALOIS_LOG_WARN("shouldn't ever get here right?"); - } - } - } - }); - - TimerStop(&timer); -} - -// TODO optimize further? 
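The DegreeCounting and EdgeCreation steps below follow the standard two-pass CSR construction: count the sampled out/in degree of every subgraph node, prefix-sum those counts into end offsets, then scatter each sampled edge at its source node's running cursor. The following is a self-contained toy version using plain vectors rather than the LC_CSR_CSC graph, with made-up edges.

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // edges of a tiny sampled subgraph as (src, dst) pairs over 3 nodes
  std::vector<std::pair<unsigned, unsigned>> edges = {{0, 1}, {0, 2}, {2, 0}};
  const unsigned n = 3;

  std::vector<unsigned> degree(n, 0);
  for (auto& e : edges) degree[e.first]++;   // pass 1: count degrees

  std::vector<unsigned> end_offset(degree);  // pass 2: inclusive prefix sum
  for (unsigned i = 1; i < n; i++) end_offset[i] += end_offset[i - 1];

  std::vector<unsigned> out_dsts(edges.size());
  std::vector<unsigned> cursor(n, 0);
  for (auto& e : edges) {                    // pass 3: scatter edges
    unsigned base = (e.first == 0) ? 0 : end_offset[e.first - 1];
    out_dsts[base + cursor[e.first]++] = e.second;
  }

  for (unsigned v = 0; v < n; v++) {
    unsigned begin = (v == 0) ? 0 : end_offset[v - 1];
    std::printf("node %u edges:", v);
    for (unsigned e = begin; e < end_offset[v]; e++)
      std::printf(" ->%u", out_dsts[e]);
    std::printf("\n");
  }
  return 0;
}

The real code does the same thing twice (once for out-edges, once for in-edges via FixEndInEdge/ConstructInEdge) and additionally records, per placed edge, the index of the original graph edge it came from.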
-void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting(
-    const GNNGraph& gnn_graph) {
-  galois::StatTimer timer("DegreeCounting", kRegionName);
-  TimerStart(&timer);
-
-  if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) {
-    local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02);
-  }
-
-  if (local_subgraph_in_degrees_.size() < num_subgraph_nodes_) {
-    local_subgraph_in_degrees_.resize(num_subgraph_nodes_ * 1.02);
-  }
-
-  galois::do_all(
-      galois::iterate(begin(), end()),
-      [&](uint32_t subgraph_id) {
-        uint32_t node_id = subgraph_id_to_lid_[subgraph_id];
-        uint32_t out_degrees = 0;
-        for (auto out_edge_iter : gnn_graph.edges(node_id)) {
-          if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) {
-            out_degrees++;
-          }
-        }
-        local_subgraph_out_degrees_[subgraph_id] = out_degrees;
-
-        uint32_t in_degrees = 0;
-        for (auto in_edge_iter : gnn_graph.in_edges(node_id)) {
-          if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) {
-            in_degrees++;
-          }
-        }
-        local_subgraph_in_degrees_[subgraph_id] = in_degrees;
-      },
-      galois::loopname("DegreeCountingDoAll"), galois::steal());
-
-  TimerStop(&timer);
-}
-
-// TODO optimize further?
-void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation(
-    const GNNGraph& gnn_graph) {
-  galois::StatTimer timer("EdgeConstruction", kRegionName);
-  TimerStart(&timer);
-  // galois::DGAccumulator empty_masters;
-  // galois::DGAccumulator empty_mirrors;
-  // empty_masters.reset();
-  // empty_mirrors.reset();
-
-  // galois::DGAccumulator total_sn;
-  // total_sn.reset();
-  // total_sn += num_subgraph_nodes_;
-  // size_t global_sub_size = total_sn.reduce();
-
-  // prefix sum over subgraph degrees from previous phase to get starting points
-  for (size_t i = 1; i < num_subgraph_nodes_; i++) {
-    // if (local_subgraph_out_degrees_[i] == 0 &&
-    //     local_subgraph_in_degrees_[i] == 0) {
-    //   if (i < subgraph_master_boundary_) {
-    //     empty_masters += 1;
-    //   } else {
-    //     if (gnn_graph.GetNonLayerZeroMasters().test(i)) {
-    //       empty_masters += 1;
-    //     } else {
-    //       empty_mirrors += 1;
-    //     }
-    //   }
-    //}
-    local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1];
-    local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1];
-  }
-
-  // uint32_t emaster = empty_masters.reduce();
-  // uint32_t emirror = empty_mirrors.reduce();
-  // if (gnn_graph.host_id() == 0) {
-  //   galois::gInfo("Empty masters percent is ", emaster /
-  //   (float)global_sub_size,
-  //                 " ", emaster, " ", global_sub_size);
-  //   galois::gInfo("Empty mirrors percent is ", emirror /
-  //   (float)global_sub_size,
-  //                 " ", emirror, " ", global_sub_size);
-  //}
-
-  // allocate then set node endpoints
-  num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1];
-
-  galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName);
-  TimerStart(&alloc_time);
-  underlying_graph_.DeallocateOnly();
-  underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_);
-  underlying_graph_.CSCAllocate();
-  TimerStop(&alloc_time);
-
-  galois::gInfo(gnn_graph.host_prefix(), "Subgraph nodes and edges are ",
-                num_subgraph_nodes_, " ", num_subgraph_edges_);
-
-  galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_),
-                 [&](uint32_t subgraph_id) {
-                   underlying_graph_.fixEndEdge(
-                       subgraph_id, local_subgraph_out_degrees_[subgraph_id]);
-                   underlying_graph_.FixEndInEdge(
-                       subgraph_id, local_subgraph_in_degrees_[subgraph_id]);
-                 });
-  if (subedge_to_original_edge_.size() < num_subgraph_edges_) {
-    subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02);
-  }
-  if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) {
-    in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02);
-  }
-
-  // save edges + save reference to layer sample status
-  galois::do_all(
-      galois::iterate(begin(), end()),
-      [&](uint32_t subgraph_id) {
-        uint32_t node_id = subgraph_id_to_lid_[subgraph_id];
-        assert(subgraph_id != std::numeric_limits::max());
-        uint32_t out_location = 0;
-        uint32_t in_location = 0;
-        if (subgraph_id != 0) {
-          out_location = local_subgraph_out_degrees_[subgraph_id - 1];
-          in_location = local_subgraph_in_degrees_[subgraph_id - 1];
-        }
-
-        for (auto out_edge_iter : gnn_graph.edges(node_id)) {
-          if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) {
-            assert(lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] !=
-                   std::numeric_limits::max());
-            subedge_to_original_edge_[out_location] = *out_edge_iter;
-
-            underlying_graph_.constructEdge(
-                out_location++,
-                lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]);
-          }
-        }
-
-        for (auto in_edge_iter : gnn_graph.in_edges(node_id)) {
-          if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) {
-            in_subedge_to_original_edge_[in_location] =
-                *(gnn_graph.InEdgeToOutEdge(in_edge_iter));
-            underlying_graph_.ConstructInEdge(
-                in_location++,
-                lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]);
-          }
-        }
-        assert(out_location == local_subgraph_out_degrees_[subgraph_id]);
-        assert(in_location == local_subgraph_in_degrees_[subgraph_id]);
-      },
-      galois::loopname("EdgeCreationDoAll"), galois::steal());
-  TimerStop(&timer);
-}
-
-void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation(
-    GNNGraph& gnn_graph) {
-  galois::StatTimer timer("NodeFeatureCreation", kRegionName);
-  TimerStart(&timer);
-  size_t feat_length = gnn_graph.node_feature_length();
-  subgraph_node_features_.resize(feat_length * num_subgraph_nodes_);
-
-  galois::do_all(galois::iterate(begin(), end()), [&](size_t subgraph_node_id) {
-    size_t local_id = subgraph_id_to_lid_[subgraph_node_id];
-    std::memcpy(
-        &(subgraph_node_features_[subgraph_node_id * feat_length]),
-        &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]),
-        feat_length * sizeof(GNNFeature));
-  });
-  TimerStop(&timer);
-}
diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp
index eed3143a01..8b13789179 100644
--- a/libgnn/src/layers/DenseLayer.cpp
+++ b/libgnn/src/layers/DenseLayer.cpp
@@ -1,145 +1 @@
-#include "galois/Logging.h"
-#include "galois/GNNMath.h"
-#include "galois/layers/DenseLayer.h"
-galois::DenseLayer::DenseLayer(
-    size_t layer_num, const galois::graphs::GNNGraph& graph,
-    PointerWithSize* backward_output_matrix,
-    const GNNLayerDimensions& dimensions, const GNNLayerConfig& config)
-    : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config),
-      input_column_intermediates_(dimensions.input_columns),
-      output_column_intermediates_(dimensions.output_columns) {
-  // TODO Need to make sure that layer knows about forward/backward matrix
-  // sharing (e.g., overwriting previously used input to save space)
-  GALOIS_LOG_FATAL("This layer has not been kept up to date; do not use until "
-                   "sure it's been updated");
-  size_t num_input_elements =
-      layer_dimensions_.input_rows * layer_dimensions_.input_columns;
-  in_temp_1_.resize(num_input_elements, 0);
-  size_t num_output_elements =
-      layer_dimensions_.input_rows * layer_dimensions_.output_columns;
-  GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements);
-  layer_type_ = galois::GNNLayerType::kDense;
-  p_in_temp_1_ = PointerWithSize(in_temp_1_);
-  GALOIS_LOG_VERBOSE("Dense initialized");
-}
-
-const galois::PointerWithSize
-galois::DenseLayer::ForwardPhase(
-    const galois::PointerWithSize input_embeddings) {
-  GALOIS_LOG_VERBOSE("Calling forward phase");
-  assert(input_embeddings.size() ==
-         (layer_dimensions_.input_rows * layer_dimensions_.input_columns));
-  assert(p_in_temp_1_.size() == input_embeddings.size());
-  assert(p_forward_output_matrix_.size() ==
-         (layer_dimensions_.input_rows * layer_dimensions_.output_columns));
-  // pointer to input to operate on
-  const GNNFloat* input_data = input_embeddings.data();
-  // first, dropout
-  if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) {
-    DoDropout(input_embeddings, &p_in_temp_1_);
-    input_data = p_in_temp_1_.data();
-  }
-
-  // FW
-  UpdateEmbeddings(input_data, p_forward_output_matrix_.data());
-
-  if (!config_.disable_activation) {
-    GALOIS_LOG_VERBOSE("Doing activation");
-    Activation();
-  }
-
-  assert(p_forward_output_matrix_.size() ==
-         (layer_dimensions_.input_rows * layer_dimensions_.output_columns));
-  return p_forward_output_matrix_;
-}
-
-galois::PointerWithSize galois::DenseLayer::BackwardPhase(
-    galois::PointerWithSize prev_layer_input,
-    galois::PointerWithSize* input_gradient) {
-  assert(layer_phase_ == GNNPhase::kTrain);
-
-  // derivative of activation
-  if (!config_.disable_activation) {
-    ActivationDerivative(input_gradient);
-  }
-
-  if (layer_number_ != 0) {
-    // derivative for update
-    // backout = F'
-    UpdateEmbeddingsDerivative(input_gradient->data(),
-                               p_backward_output_matrix_.data());
-  }
-
-  galois::PointerWithSize input_data;
-  if (!config_.disable_dropout) {
-    // dropout result is currently stored in temp 1
-    // needs to be used before it gets overwritten
-    input_data = p_in_temp_1_;
-  } else {
-    // no dropout = use vanilla input
-    input_data = prev_layer_input;
-  }
-
-  // W' = F^T (FW)'
-  galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns,
-                     layer_dimensions_.input_rows,
-                     layer_dimensions_.output_columns, input_data.data(),
-                     input_gradient->data(), p_layer_weight_gradients_.data());
-  // sync weight gradients; note aggregation sync occurs in the function call
-  // already
-  WeightGradientSyncSum();
-
-  if (!config_.disable_dropout && layer_number_ != 0) {
-    DoDropoutDerivative();
-  }
-
-  return p_backward_output_matrix_;
-}
-
-void galois::DenseLayer::UpdateEmbeddings(const GNNFloat* node_embeddings,
-                                          GNNFloat* output) {
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    /* TODO(lhc) implement this
-    gpu_object_.UpdateEmbeddingsGPU(
-        layer_dimensions_.input_rows, layer_dimensions_.input_columns,
-        layer_dimensions_.output_columns, node_embeddings,
-        base_gpu_object_.layer_weights(), output);
-    */
-  } else {
-#endif
-    // CPU version is just a call into CBlas
-    galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows,
-                       layer_dimensions_.input_columns,
-                       layer_dimensions_.output_columns, node_embeddings,
-                       layer_weights_.data(), output);
-#ifdef GALOIS_ENABLE_GPU
-  }
-#endif
-}
-
-void galois::DenseLayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients,
-                                                    GNNFloat* output) {
-  assert(p_layer_weights_.size() ==
-         layer_dimensions_.input_columns * layer_dimensions_.output_columns);
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    /* TODO(lhc) implement this
-    gpu_object_.UpdateEmbeddingsDerivativeGPU(
-        layer_dimensions_.input_rows, layer_dimensions_.input_columns,
-        layer_dimensions_.output_columns, gradients,
-        base_gpu_object_.layer_weights(), output);
-    */
-  } else {
-#endif
-    // difference is Trans for B matrix (data) to get z by y (weights is y by z
-    // normally); result is x by y
-    galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows,
-                       layer_dimensions_.output_columns,
-                       layer_dimensions_.input_columns, gradients,
-                       layer_weights_.data(), output);
-#ifdef GALOIS_ENABLE_GPU
-  }
-#endif
-}
diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp
index 82a864a41d..8b13789179 100644
--- a/libgnn/src/layers/GNNLayer.cpp
+++ b/libgnn/src/layers/GNNLayer.cpp
@@ -1,558 +1 @@
-#include "galois/Logging.h"
-#include "galois/layers/GNNLayer.h"
-#include "galois/layers/GradientSyncStructures.h"
-galois::GNNLayer::GNNLayer(size_t layer_num,
-                           const galois::graphs::GNNGraph& graph,
-                           PointerWithSize* backward_output_matrix,
-                           const GNNLayerDimensions& dimensions,
-                           const GNNLayerConfig& config)
-    : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions),
-      config_(config) {
-  // TODO(loc)
-  // this is currently a backward-compatibility hack, need to have caller
-  // set output rows rather than created here
-  layer_dimensions_.output_rows = layer_dimensions_.input_rows;
-
-  if (config_.allocate_weights) {
-    // dropout allocation; dropout is same as input
-    if (!config_.disable_dropout) {
-      dropout_mask_.resize(layer_dimensions_.input_rows *
-                               layer_dimensions_.input_columns,
-                           false);
-    }
-    // allocate memory based on layer dimensions
-    size_t num_weight_elements =
-        layer_dimensions_.input_columns * layer_dimensions_.output_columns;
-    galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_,
-                  ", layer weights ", num_weight_elements, " (",
-                  FloatElementsToGB(num_weight_elements), " GB)");
-    layer_weights_.resize(num_weight_elements);
-    galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_,
-                  ", layer gradients ", num_weight_elements, " (",
-                  FloatElementsToGB(num_weight_elements), " GB)");
-    layer_weight_gradients_.resize(num_weight_elements, 0);
-#ifdef GALOIS_ENABLE_GPU
-    if (device_personality == DevicePersonality::GPU_CUDA) {
-      base_gpu_object_.InitWeightMemory(num_weight_elements);
-      base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows *
-                                         layer_dimensions_.input_columns);
-    }
-#endif
-
-    GlorotBengioInit(&layer_weights_);
-  }
-
-  // TODO(loc) optimize this and layer creation in general
-  // this does not use output_rows and assumes the worst case where
-  // all nodes are generated
-  // for now it's kept as input_rows so as to not break things
-  size_t num_output_elements =
-      layer_dimensions_.input_rows * layer_dimensions_.output_columns;
-
-  if (!config_.disable_output) {
-    galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_,
-                  ", forward output matrix ", num_output_elements, " (",
-                  FloatElementsToGB(num_output_elements), " GB)");
-    forward_output_matrix_.resize(num_output_elements, 0);
-  }
-
-  if (layer_number_ != 0) {
-    GALOIS_LOG_VASSERT(
-        backward_output_matrix->size() ==
-            layer_dimensions_.input_rows * layer_dimensions_.input_columns,
-        "backward output size {} should equal input size {}",
-        backward_output_matrix->size(),
-        layer_dimensions_.input_rows * layer_dimensions_.input_columns);
-  } else {
-    GALOIS_LOG_VASSERT(backward_output_matrix->data() == nullptr,
-                       "layer 0 should null ptr backward output");
-    GALOIS_LOG_VASSERT(backward_output_matrix->size() == 0,
-                       "layer 0 should size 0 backward output");
-  }
-
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    base_gpu_object_.InitInOutMemory(num_output_elements,
-                                     layer_dimensions_.input_rows *
-                                         layer_dimensions_.input_columns);
-
-    // initialize the PointerWithSize wrappers
-    p_layer_weights_ = PointerWithSize(
-        base_gpu_object_.layer_weights(), layer_weights_.size());
-    p_layer_weight_gradients_ =
-        PointerWithSize(base_gpu_object_.layer_weight_gradients(),
-                        layer_weight_gradients_.size());
-    p_forward_output_matrix_ = PointerWithSize(
-        base_gpu_object_.forward_output(), forward_output_matrix_.size());
-    p_backward_output_matrix_ = PointerWithSize(
-        base_gpu_object_.backward_output(), backward_output_matrix->size());
-    // TODO can clear the cpu side vectors/don't use .size() since optimally
-    // they aren't initialized
-  } else {
-#endif
-    // initialize the PointerWithSize wrappers
-    p_layer_weights_ = PointerWithSize(layer_weights_);
-    p_layer_weight_gradients_ =
-        PointerWithSize(layer_weight_gradients_);
-    p_forward_output_matrix_ =
-        PointerWithSize(forward_output_matrix_);
-    p_backward_output_matrix_ = *backward_output_matrix;
-#ifdef GALOIS_ENABLE_GPU
-  }
-#endif
-}
-
-void galois::GNNLayer::ResizeOutputMatrix(size_t new_output_row) {
-  size_t num_output_elements =
-      new_output_row * layer_dimensions_.output_columns;
-
-  if (!config_.disable_output &&
-      (forward_output_matrix_.size() < num_output_elements)) {
-    galois::gInfo(graph_.host_prefix(), "Resizing layer ", layer_number_,
-                  ", forward output matrix to ", num_output_elements, " (",
-                  FloatElementsToGB(num_output_elements), " GB)");
-    // resize with a bit of a buffer to prevent possible future resizes
-    size_t buffer_size = (num_output_elements * 0.02);
-    forward_output_matrix_.resize(num_output_elements + buffer_size, 0);
-  }
-
-  // XXX(hochan) GPU end
-#ifdef GALOIS_ENABLE_GPU
-  // XXX(hochan)
-#endif
-  // reinitialize the PointerWithSize wrappers
-  p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_);
-#ifdef GALOIS_ENABLE_GPU
-  // XXX(hochan)
-#endif
-}
-
-void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) {
-  float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns +
-                                         layer_dimensions_.input_columns);
-  std::default_random_engine rng(1 + layer_number_);
-  std::uniform_real_distribution dist(-max, max);
-
-  for (size_t i = 0; i < vector_to_init->size(); i++) {
-    (*vector_to_init)[i] = dist(rng);
-  }
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    CopyLayerWeightsToGPU();
-  }
-#endif
-}
-
-void galois::GNNLayer::PairGlorotBengioInit(std::vector* vector1,
-                                            std::vector* vector2) {
-  // multiplied by 2 here because 2 pieces are 1 unit
-  float max =
-      std::sqrt(6.0) / std::sqrt((2 * layer_dimensions_.output_columns) +
-                                 layer_dimensions_.input_columns);
-  assert(vector1->size() ==
-         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
-  assert(vector2->size() ==
-         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
-  std::default_random_engine rng(1 + layer_number_);
-  std::uniform_real_distribution dist(-max, max);
-
-  for (size_t i = 0; i < vector1->size(); i++) {
-    (*vector1)[i] = dist(rng);
-  }
-  for (size_t i = 0; i < vector2->size(); i++) {
-    (*vector2)[i] = dist(rng);
-  }
-
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    CopyLayerWeightsToGPU();
-  }
-#endif
-}
-
-void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) {
-  galois::do_all(
-      galois::iterate(static_cast(0), vector_to_init->size()),
-      [&](size_t i) {
- // pull from the class's per thread RNG - (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); - }, - galois::loopname("RandomInitVector")); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - CopyLayerWeightsToGPU(); - } -#endif -} - -void galois::GNNLayer::DoDropoutCPU( - const PointerWithSize input_to_dropout, - PointerWithSize* output_matrix) { - // TODO This (and dropout in general) may not work in the sampling setting - size_t num_elements = - layer_dimensions_.input_rows * layer_dimensions_.input_columns; - - // determine which parts to drop - galois::do_all( - galois::iterate(static_cast(0), num_elements), - [&](size_t i) { - dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); - }, - galois::loopname("LayerDropoutRNG")); - - // create new matrix with non-dropped input + some scaling - // TODO save scaling elsewhere? - GNNFloat scale = 1. / (1. - config_.dropout_rate); - galois::do_all( - galois::iterate(static_cast(0), num_elements), - [&](size_t i) { - (*output_matrix)[i] = input_to_dropout[i] * - static_cast(dropout_mask_[i]) * scale; - }, - galois::loopname("LayerDropout")); -} - -void galois::GNNLayer::DoDropout( - const PointerWithSize input_to_dropout, - PointerWithSize* output_matrix) { - galois::StatTimer timer("ForwardDropout", "GNNLayer"); - TimerStart(&timer); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, - config_.dropout_rate); - } else { -#endif - DoDropoutCPU(input_to_dropout, output_matrix); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::ReconstructDropoutMatrix( - const PointerWithSize input_to_dropout, - PointerWithSize* output_matrix) { - galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); - TimerStart(&timer); - // reuse the dropout mask from a previous dropout call - size_t num_elements = output_matrix->size(); - GNNFloat scale = 1. / (1. - config_.dropout_rate); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.ReconstructDropoutMatrixGPU( - input_to_dropout, output_matrix, num_elements, scale); - } else { -#endif - galois::do_all( - galois::iterate(static_cast(0), num_elements), - [&](size_t i) { - (*output_matrix)[i] = input_to_dropout[i] * - static_cast(dropout_mask_[i]) * scale; - }, - galois::loopname("ReconstructDropout")); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::DoDropoutDerivative() { - galois::StatTimer timer("BackwardDropout", "GNNLayer"); - TimerStart(&timer); - assert(p_backward_output_matrix_.size() == dropout_mask_.size()); - GNNFloat scale = 1. / (1. 
- config_.dropout_rate); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), - scale); - } else { -#endif - // use dropout mask to figure out derivative - galois::do_all( - galois::iterate(static_cast(0), - p_backward_output_matrix_.size()), - [&](size_t i) { - p_backward_output_matrix_[i] = - p_backward_output_matrix_[i] * - static_cast(dropout_mask_[i]) * scale; - }, - galois::loopname("LayerDropoutDerivative")); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::Activation() { - galois::StatTimer timer("ForwardActivation", "GNNLayer"); - TimerStart(&timer); - - // TODO only does relu at the moment; should check user specified activation - // and act accordingly -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); - } else { -#endif - if (activation_memo_.size() != p_forward_output_matrix_.size()) { - activation_memo_.resize(p_forward_output_matrix_.size()); - } - activation_memo_.reset(); - assert(activation_memo_.size() == p_forward_output_matrix_.size()); - assert(layer_dimensions_.output_rows * layer_dimensions_.output_columns <= - p_forward_output_matrix_.size()); - - galois::do_all(galois::iterate(static_cast(0), - layer_dimensions_.output_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - if (p_forward_output_matrix_[i] > 0.0) { - // do nothing, keep value; set the memo though - activation_memo_.set(i); - } else { - p_forward_output_matrix_[i] = 0; - } - }); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::ActivationDerivative( - PointerWithSize* gradient) { - galois::StatTimer timer("BackwardActivation", "GNNLayer"); - TimerStart(&timer); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.ActivationDerivativeGPU(gradient->data(), - gradient->size()); - } else { -#endif - assert(gradient->size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - // TODO only does relu at the moment; should check user specified activation - // and act accordingly - // keep gradient if the original output was greater than 0 - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.output_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - // it was <= 0 before; set back to 0 - if (!activation_memo_.test(i)) { - (*gradient)[i] = 0; - } - }, - galois::loopname("ReLU-Derivative")); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::WeightGradientSyncSum() { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - TimerStart(&clubbed_timer); - galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); - TimerStart(&t); - int weight_size = static_cast(p_layer_weight_gradients_.size()); - - // TODO(loc) remove this limitation later; can just do a loop over the weight - // matrix - if (p_layer_weight_gradients_.size() > - size_t{std::numeric_limits::max()}) { - GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " - "int at the moment"); - } -#ifdef GALOIS_ENABLE_GPU - // TODO(lhc) make this clang option later - bool gpu_direct_enabled = false; - if (device_personality == DevicePersonality::GPU_CUDA && - !gpu_direct_enabled) { - base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); - MPI_Allreduce(MPI_IN_PLACE, 
layer_weight_gradients_.data(), weight_size, - MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); - base_gpu_object_.CopyToWeightGradients(layer_weight_gradients_); - } else { -#endif - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_.data()), - weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&t); - TimerStop(&clubbed_timer); -} - -void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, - size_t max_rows) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.input_columns; - assert(start_node * row_index <= input->size()); - assert(end_node * row_index <= input->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); - } else { -#endif - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*input)[non_master * row_index + i] = 0; - } - }, - galois::loopname("MaskInputNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, - size_t max_rows, - const galois::DynamicBitSet& bs) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.input_columns; - assert(start_node * row_index <= input->size()); - assert(end_node * row_index <= input->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); - } else { -#endif - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - if (!bs.test(non_master)) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*input)[non_master * row_index + i] = 0; - } - } - }, - galois::loopname("MaskInputNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GNNLayer::MaskGradientNonMasters( - PointerWithSize* gradient, size_t max_rows) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.output_columns; - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - assert(start_node * row_index <= gradient->size()); - assert(end_node * row_index <= gradient->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, - row_index); - } else { -#endif - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*gradient)[non_master * row_index + i] = 0; - } - }, - 
galois::loopname("MaskGradientNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GNNLayer::MaskGradientNonMasters( - PointerWithSize* gradient, size_t max_rows, - const galois::DynamicBitSet& bs) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.output_columns; - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - assert(start_node * row_index <= gradient->size()); - assert(end_node * row_index <= gradient->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, - row_index); - } else { -#endif - // galois::gInfo(start_node, " to ", end_node); - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // if something is not a master, kill it - if (!bs.test(non_master)) { - // galois::gInfo("don't keep ", non_master); - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*gradient)[non_master * row_index + i] = 0; - } - } - }, - galois::loopname("MaskGradientNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index b9a9c2120c..e69de29bb2 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -1,459 +0,0 @@ -#include "galois/Logging.h" -#include "galois/GNNMath.h" -#include "galois/layers/GraphConvolutionalLayer.h" - -galois::GraphConvolutionalLayer::GraphConvolutionalLayer( - size_t layer_num, const galois::graphs::GNNGraph& graph, - PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), - input_column_intermediates_(dimensions.input_columns), - output_column_intermediates_(dimensions.output_columns) { - galois::gWarn( - "GCN layer not up to date with new subgraph/sampling changes; " - "do not use until updated to reflect changes (see GraphSAGE layer)"); - - size_t num_input_elements = - layer_dimensions_.input_rows * layer_dimensions_.input_columns; - if (!config_.disable_dropout || config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_input_elements); - } else { -#endif - in_temp_1_.resize(num_input_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - // only on in dropout case + if in temp is smaller than out temp - if (!config_.disable_dropout && - (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN input temp var 2 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - 
gpu_object_.AllocateInTemp2(num_input_elements); - } else { -#endif - in_temp_2_.resize(num_input_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; - - // only needed if out temp would be smaller than intemp - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - // xform matrix first to work with a smaller output size - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_output_elements); - } else { -#endif - out_temp_.resize(num_output_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - layer_type_ = galois::GNNLayerType::kGraphConvolutional; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // init pointers with size - p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); - p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); - p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), num_output_elements); - } else { -#endif - p_in_temp_1_ = PointerWithSize(in_temp_1_); - p_in_temp_2_ = PointerWithSize(in_temp_2_); - p_out_temp_ = PointerWithSize(out_temp_); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - GALOIS_LOG_VERBOSE("Conv layer initialized"); -} - -const galois::PointerWithSize -galois::GraphConvolutionalLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { - galois::StatTimer timer("ForwardPhase", kRegionName); - timer.start(); - GALOIS_LOG_VERBOSE("Calling forward phase"); - assert(input_embeddings.size() == - (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_forward_output_matrix_.size() == - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); - // pointer to input to operate on - const GNNFloat* input_data = input_embeddings.data(); - GNNFloat* agg_data; - // first, dropout - if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(input_embeddings, &p_in_temp_1_); - input_data = p_in_temp_1_.data(); - agg_data = p_in_temp_2_.data(); - } else { - agg_data = p_in_temp_1_.data(); - } - - // flip aggregate/update if dimensions favor it (do less work) - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, - &input_column_intermediates_); - UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); - } else { - // update to aggregate - // FW - UpdateEmbeddings(input_data, p_out_temp_.data()); - // A(FW) - AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), - p_forward_output_matrix_.data(), - &output_column_intermediates_); - } - - if (!config_.disable_activation) { - GALOIS_LOG_VERBOSE("Doing activation"); - Activation(); - } - - assert(p_forward_output_matrix_.size() == - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); - timer.stop(); - - return p_forward_output_matrix_; -} - -galois::PointerWithSize -galois::GraphConvolutionalLayer::BackwardPhase( - galois::PointerWithSize prev_layer_input, - galois::PointerWithSize* input_gradient) { - galois::StatTimer timer("BackwardPhase", kRegionName); - 
galois::StatTimer weight_gradient_timer("BackwardPhaseWeight", kRegionName); - galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); - timer.start(); - - assert(layer_phase_ == GNNPhase::kTrain); - - // derivative of activation - if (!config_.disable_activation) { - ActivationDerivative(input_gradient); - } - - // AFW = O - galois::PointerWithSize input_data; - galois::PointerWithSize agg_data; - if (!config_.disable_dropout) { - // dropout result is currently stored in temp 1 - // needs to be used before it gets overwritten - input_data = p_in_temp_1_; - agg_data = p_in_temp_2_; - } else { - // no dropout = use vanilla input - input_data = prev_layer_input; - agg_data = p_in_temp_1_; - } - - // NOTE: PREV LAYER INPUT AND BACKWARDOUTPUT ARE THE SAME MEMORY LOCATION; - // BEWARE OF DEPENDENCIES - - // derivative of aggregation/update - // TODO clean up logic here to reduce nesting - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - // aggdata can == p_intemp1; in other words, need to use before overwrite - // mask it, then use it - MaskInputNonMasters(&agg_data); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, agg_data.data(), - input_gradient->data(), p_layer_weight_gradients_.data()); - } else { -#endif - weight_gradient_timer.start(); - // temp 2 holds aggregated feature vectors from forward phase - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - agg_data.data(), input_gradient->data(), - p_layer_weight_gradients_.data()); - weight_gradient_timer.stop(); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - // gradient isn't masked here; only temp1, which has already been - // overwritten = fine - if (layer_number_ != 0) { - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); - } - } else { - // TODO at this point, out_temp contains memoized FW - // can use it to get A' = O' (FW)^T - // aggregate occurs regardless of layer being equal to 0 because it is - // required in this case for the weight gradient calculation - // this is (FW)' - AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_, true); - - // done after above because input_data = p_backward_output_matrix in some - // cases; use first before overwriting here if layer # doesn't = 0, it means - // I can mess with the input data itself instad of masking the gradients I - // can mask the input - if (layer_number_ != 0) { - MaskInputNonMasters(&input_data); - } else { - // if 0 then no input to mask: mask the gradient - // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(&p_out_temp_); - } - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.GetWeightGradientsGPU( - 
layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - } else { -#endif - weight_gradient_timer.start(); - galois::CBlasSGEMM(CblasTrans, CblasNoTrans, - layer_dimensions_.input_columns, - layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - weight_gradient_timer.stop(); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - if (layer_number_ != 0) { - // can now overwrite p_backward without issue; since input gradient - // is untouched if layer number isn't 0 this will be correct - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); - } - } - - // sync weight gradients; note aggregation sync occurs in the function call - // already - weight_gradient_sync_timer.start(); - WeightGradientSyncSum(); - weight_gradient_sync_timer.stop(); - - if (!config_.disable_dropout && layer_number_ != 0) { - DoDropoutDerivative(); - } - - timer.stop(); - return p_backward_output_matrix_; -} - -void galois::GraphConvolutionalLayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts) { - AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); -} - -void galois::GraphConvolutionalLayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts, - bool is_backward) { - std::string agg_timer_name = "Aggregate"; - if (!is_backward) { - agg_timer_name += "Forward"; - } else { - agg_timer_name += "Backward"; - } - galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); - timer.start(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - size_t last_master = *(graph_.end_owned()); - gpu_object_.AggregateAllGPU( - graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization, - config_.disable_self_aggregate, last_master); - graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); - } else { -#endif - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); -#ifdef GALOIS_ENABLE_GPU - } -#endif - timer.stop(); -} - -void galois::GraphConvolutionalLayer::AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*) { - galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = graph_.size(); - size_t last_master = *(graph_.end_owned()); - assert(0 == *(graph_.begin_owned())); - - galois::do_all( - galois::iterate(static_cast(0), num_nodes), - [&](size_t src) { - size_t index_to_src_feature = src * column_length; - // zero out src feature first - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = 0; - } - - if (layer_phase_ == GNNPhase::kTrain) { - if (IsSampledLayer()) { - // XXX(loc) - GALOIS_LOG_WARN( - "Edge sampling not yet implemented for GCN; only SAGE"); - // check if node is part of sampled graph; ignore after 0'ing if not - // sampled - if (!graph_.IsInSampledGraph(src)) - return; - } - } - - GNNFloat source_norm = 0.0; - if (!config_.disable_normalization) { - source_norm = graph_.GetGCNNormFactor(src); - } - - // init to self - if 
(!config_.disable_self_aggregate) { - graphs::bitset_graph_aggregate.set(src); - // only aggregate self once on master - if (src < last_master) { - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i] * source_norm * - source_norm; - } - } - } - - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - size_t dst = graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set(src); - - if (layer_phase_ == GNNPhase::kTrain) { - if (IsSampledLayer()) { - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; - } - } - - size_t index_to_dst_feature = dst * column_length; - - if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm * graph_.GetGCNNormFactor(dst); - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); - } - } - }, - galois::chunk_size<1>(), galois::steal(), - galois::loopname("ConvolutionalAggregateAll")); - // aggregate sync - aggregate_all_sync_timer.start(); - graph_.AggregateSync(aggregate_output, column_length); - aggregate_all_sync_timer.stop(); -} - -void galois::GraphConvolutionalLayer::UpdateEmbeddings( - const GNNFloat* node_embeddings, GNNFloat* output) { - galois::StatTimer timer("ForwardXform", kRegionName); - timer.start(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_.data(), output); -#ifdef GALOIS_ENABLE_GPU - } -#endif - timer.stop(); -} - -void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( - const GNNFloat* gradients, GNNFloat* output) { - galois::StatTimer timer("BackwardXform", kRegionName); - timer.start(); - - assert(p_layer_weights_.size() == - layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, gradients, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); -#ifdef GALOIS_ENABLE_GPU - } -#endif - timer.stop(); -} diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index 0d566f0b66..e69de29bb2 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -1,124 +0,0 @@ -#include 
"galois/layers/L2NormLayer.h" -const galois::PointerWithSize -galois::L2NormLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { -#ifdef GALOIS_ENABLE_GPU - // TODO -#endif - GALOIS_LOG_FATAL( - "L2 layer has not been kept up to date for months; do not use"); - return ForwardPhaseCPU(input_embeddings); -} - -const galois::PointerWithSize -galois::L2NormLayer::ForwardPhaseCPU( - const galois::PointerWithSize input_embeddings) { - forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); - // for each row, get square root of squared sums then normalize - const size_t feature_length = layer_dimensions_.input_columns; - // TODO(loc) make sure this works in distributed setting as well - galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), - [&](const unsigned row) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) - return; - } - - if (graph_.IsValidForPhase(row, layer_phase_)) { - size_t row_offset = row * feature_length; - float running_square_sum = 0.0; - // get square sums - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - running_square_sum += std::pow(input_embeddings[row_index], 2); - } - - // make sure running sum isn't too small - running_square_sum = - (running_square_sum < 1.0e-12) ? 10e-12 : running_square_sum; - - // sqrt of sums, then divide row by it - float sqrt_squares = std::pow(running_square_sum, 0.5); - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - forward_output_matrix_[row_index] = - input_embeddings[row_index] / sqrt_squares; - } - } - }, - galois::loopname("L2ForwardNormalization")); - - return forward_output_matrix_; -} - -galois::PointerWithSize galois::L2NormLayer::BackwardPhase( - PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) { -#ifdef GALOIS_ENABLE_GPU - // TODO -#endif - return BackwardPhaseCPU(prev_layer_input, input_gradient); -} - -galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( - galois::PointerWithSize prev_layer_input, - galois::PointerWithSize* input_gradient) { - galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), - [&](size_t i) { p_backward_output_matrix_[i] = 0; }); - const size_t feature_length = layer_dimensions_.input_columns; - - // derivative of some x_1 is sum of gradient w.r.t. 
x_1 for all elements of - // the row (since l2 norm affects entire row) - // The math itself can be derived using quotient/chain rule on each element - // of the normalized row - galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), - [&](const unsigned row) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) - return; - } - - if (graph_.IsValidForPhase(row, layer_phase_)) { - size_t row_offset = row * feature_length; - // note: if you work this out on paper it turns out that terms that - // seem extra in the way this is calculated below simply get canceled - // out, so this ends up working out This implementation is taken from - // the IPDPS GraphSAINT implementation: I (loc) have confirmed the - // math checks out - float running_square_sum = 0.0; - float mult_with_input = 0.0; - - // get square sums - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - running_square_sum += std::pow(prev_layer_input[row_index], 2); - // gradient multiplied with corresponding input; subtraction because - // derivative math ends up working out that way - mult_with_input -= - prev_layer_input[row_index] * (*input_gradient)[row_index]; - } - running_square_sum = - (running_square_sum < 1.0e-12) ? 10e-12 : running_square_sum; - assert(running_square_sum != 0.0); - - // denominator for all gradients is just the square sum to the -3/2'd - // power since this is -, all we have to do is multiply it later - // rather than divide - float denominator = std::pow(running_square_sum, -1.5); - assert(denominator != 0.0); - - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - p_backward_output_matrix_[row_index] = - denominator * - (prev_layer_input[row_index] * mult_with_input + - (*input_gradient)[row_index] * running_square_sum); - } - } - }, - galois::loopname("L2Backward")); - - return p_backward_output_matrix_; -} diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 032478745d..99d0ffc5f0 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -1,880 +1 @@ -#include "galois/Logging.h" -#include "galois/GNNMath.h" #include "galois/layers/SAGELayer.h" - -galois::SAGELayer::SAGELayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, - PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config, - const SAGELayerConfig& sage_config) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), - sage_config_(sage_config), - input_column_intermediates_(dimensions.input_columns), - output_column_intermediates_(dimensions.output_columns) { - if (!sage_config_.disable_concat) { - // there are now 2 weight matrices used: one for self, one for aggregation - // abstractly it's one matrix: W = W1 | W2 - size_t num_weight_elements = - layer_dimensions_.input_columns * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE second layer weights ", num_weight_elements, " (", - FloatElementsToGB(num_weight_elements), " GB)"); - // TODO(lhc) for now, allocate dummy cpu weight2 for copying to GPU - layer_weights_2_.resize(num_weight_elements); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateWeight2(num_weight_elements); - } -#endif - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE second 
layer gradients ", num_weight_elements, " (", - FloatElementsToGB(num_weight_elements), " GB)"); - layer_weight_gradients_2_.resize(num_weight_elements, 0); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateWeightGradient2(num_weight_elements); - } -#endif - - // reinit both weight matrices as one unit - PairGlorotBengioInit(&layer_weights_, &layer_weights_2_); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // copy weight2 to GPU - gpu_object_.CopyToWeights2(layer_weights_2_); - p_layer_weights_2_ = PointerWithSize( - gpu_object_.layer_weights_2(), num_weight_elements); - p_layer_weight_gradients_2_ = PointerWithSize( - gpu_object_.layer_weight_gradients_2(), num_weight_elements); - } else { -#endif - // update the pointers to them as well as realloc will require it - p_layer_weights_2_ = PointerWithSize(layer_weights_2_); - p_layer_weight_gradients_2_ = - PointerWithSize(layer_weight_gradients_2_); -#ifdef GALOIS_ENABLE_GPU - } -#endif - std::vector weight_size = {num_weight_elements}; - // initialize the optimizer - second_weight_optimizer_ = std::make_unique(weight_size, 1); - } - - // TODO(loc) dropout uses input rows; this won't work if dropout is enabled - size_t num_in_temp_elements = - layer_dimensions_.output_rows * layer_dimensions_.input_columns; - - // if (layer_number_ == 0) { - // // set this to true for layer 0; it avoids aggregation completely - // // in the last layer for the backward phase - // config_.disable_aggregate_after_update = true; - // // TODO this *will* hurt test evaluation because test eval has no - // // backward phase, so the end-to-end benefits do not exist there - // // Solution to this is to allocate all intermediate structures for both - // // cases + make sure resize handles both cases - // } - - // if in temp is smaller than out temp, or if dropout exists - if (!config_.disable_dropout || config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 1 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_in_temp_elements); - } else { -#endif - in_temp_1_.resize(num_in_temp_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - // only on in dropout case + if in temp is smaller than out temp - if (!config_.disable_dropout && - (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 2 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp2(num_in_temp_elements); - } else { -#endif - in_temp_2_.resize(num_in_temp_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - size_t num_out_temp = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; - // only needed if out temp would be smaller than intemp - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE output temp var ", num_out_temp, " (", - 
FloatElementsToGB(num_out_temp), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_out_temp); - } else { -#endif - out_temp_.resize(num_out_temp, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - layer_type_ = galois::GNNLayerType::kSAGE; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // init pointers with size - p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), - num_in_temp_elements); - p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), - num_in_temp_elements); - p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), num_output_elements); - } else { -#endif - p_in_temp_1_ = PointerWithSize(in_temp_1_); - p_in_temp_2_ = PointerWithSize(in_temp_2_); - p_out_temp_ = PointerWithSize(out_temp_); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - GALOIS_LOG_VERBOSE("SAGE layer initialized"); -} - -void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, - size_t new_output_rows) { - size_t num_in_temp_elements = - new_output_rows * layer_dimensions_.input_columns; - // galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", - // in_temp_1_.size(), " and ", num_in_temp_elements, " ", - // layer_dimensions_.input_columns, " ", - // layer_dimensions_.output_columns); - - // if in temp is smaller than out temp, or if dropout exists - if (!config_.disable_dropout || config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (in_temp_1_.size() < num_in_temp_elements) { - galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, - ", SAGE input temp var 1 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); - size_t buffer_size = num_in_temp_elements * 0.02; -#ifdef GALOIS_ENABLE_GPU - // XXX(hochan) - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_in_temp_elements + buffer_size); - } else { -#endif - in_temp_1_.resize(num_in_temp_elements + buffer_size, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - // XXX(hochan) GPU - p_in_temp_1_ = PointerWithSize(in_temp_1_); - } - } - - // only on in dropout case + if in temp is smaller than out temp - if (!config_.disable_dropout && - (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { - if (in_temp_2_.size() < num_in_temp_elements) { - galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, - ", SAGE input temp var 2 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); - size_t buffer_size = num_in_temp_elements * 0.02; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp2(num_in_temp_elements + buffer_size); - } else { -#endif - in_temp_2_.resize(num_in_temp_elements + buffer_size, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - // XXX(hochan) GPU - p_in_temp_2_ = PointerWithSize(in_temp_2_); - } - } - - size_t num_output_temp_elements = - new_input_rows * layer_dimensions_.output_columns; - // only needed if out temp would be smaller than intemp - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - if (out_temp_.size() < num_output_temp_elements) { - galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, - ", SAGE output temp var ", num_output_temp_elements, " (", - FloatElementsToGB(num_output_temp_elements), " GB)"); - size_t 
buffer_size = (num_output_temp_elements * 0.02); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); - } else { -#endif - out_temp_.resize(num_output_temp_elements + buffer_size, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - p_out_temp_ = PointerWithSize(out_temp_); - } - } -} - -void galois::SAGELayer::WeightGradientSyncSum2() { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - TimerStart(&clubbed_timer); - galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); - TimerStart(&t); - int weight_size = static_cast(p_layer_weight_gradients_2_.size()); - -#ifdef GALOIS_ENABLE_GPU - bool gpu_direct_enabled = false; - if (device_personality == DevicePersonality::GPU_CUDA && - !gpu_direct_enabled) { - gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); - MPI_Allreduce(MPI_IN_PLACE, - static_cast(layer_weight_gradients_2_.data()), - weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); - gpu_object_.CopyToWeight2Gradients(layer_weight_gradients_2_); - } else { -#endif - // TODO(loc) remove this limitation later; can just do a loop over the - // weight matrix - if (p_layer_weight_gradients_2_.size() > - size_t{std::numeric_limits::max()}) { - GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " - "int at the moment"); - } - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_2_.data()), - weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&t); - TimerStop(&clubbed_timer); -} - -const galois::PointerWithSize galois::SAGELayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { - // galois::gDebug( - // "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", - // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " - // ", layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", - // layer_dimensions_.input_rows * layer_dimensions_.input_columns); - galois::StatTimer timer("ForwardPhase", kRegionName); - TimerStart(&timer); - - assert(input_embeddings.size() >= - (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); - - // pointer to input to operate on - const GNNFloat* input_data = input_embeddings.data(); - GNNFloat* agg_data; - // first, dropout - if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(input_embeddings, &p_in_temp_1_); - input_data = p_in_temp_1_.data(); - agg_data = p_in_temp_2_.data(); - } else { - agg_data = p_in_temp_1_.data(); - } - - // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part - // which is done regardless - - // flip aggregate/update if dimensions favor it (do less work) - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { - assert(p_in_temp_2_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.input_columns); - } else { - assert(p_in_temp_1_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.input_columns); - } - - // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, - &input_column_intermediates_); - assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - 
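// Illustrative sketch (not from the patch itself): the host-side gradient sum
// that WeightGradientSyncSum2 performs. Every host contributes its local
// second-weight-matrix gradient and receives the element-wise sum in place, so
// all hosts apply the same update. Assumes MPI was initialized elsewhere; the
// count must fit in an int, which is why the real code fatals on larger sizes.
#include <mpi.h>
#include <vector>

void AllReduceGradientsSketch(std::vector<float>& gradients) {
  MPI_Allreduce(MPI_IN_PLACE, gradients.data(),
                static_cast<int>(gradients.size()), MPI_FLOAT, MPI_SUM,
                MPI_COMM_WORLD);
}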
UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); - } else { - assert(p_out_temp_.size() >= - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - - // update to aggregate - // FW - UpdateEmbeddings(input_data, p_out_temp_.data(), false); - - // A(FW) - assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), - p_forward_output_matrix_.data(), - &output_column_intermediates_); - } - - if (!sage_config_.disable_concat) { - // FW1 is unaffected by the agg/update flip, so can to it - // separately - SelfFeatureUpdateEmbeddings(input_data, p_forward_output_matrix_.data()); - } - - if (!config_.disable_activation) { - GALOIS_LOG_VERBOSE("Doing activation"); - Activation(); - } - - assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); - - TimerStop(&timer); - - return p_forward_output_matrix_; -} - -galois::PointerWithSize galois::SAGELayer::BackwardPhase( - galois::PointerWithSize prev_layer_input, - galois::PointerWithSize* input_gradient) { - galois::StatTimer timer("BackwardPhase", kRegionName); - galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); - galois::StatTimer weight_gradient_sync_timer2("BackwardPhaseWeight2Sync", kRegionName); - TimerStart(&timer); - - assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); - - // derivative of activation - if (!config_.disable_activation) { - ActivationDerivative(input_gradient); - } - - // if dropout was used, use the dropout matrix for the input - galois::PointerWithSize input_data; - galois::PointerWithSize agg_data; - if (!config_.disable_dropout) { - // dropout result is currently stored in temp 1 - // needs to be used before it gets overwritten - input_data = p_in_temp_1_; - agg_data = p_in_temp_2_; - } else { - // no dropout = use vanilla input - input_data = prev_layer_input; - agg_data = p_in_temp_1_; - } - - // aggregate this here before gradient starts to get overwritten - // this is xform ffirst - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - // aggregate occurs regardless of layer being equal to 0 because it is - // required in this case for the weight gradient calculation - // this is (FW)' - // TODO: this is absolutely terrible performance wise as well; keep - // in mind - AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_, true); - } - - if (!sage_config_.disable_concat) { - if (layer_number_ != 0) { - if (graph_.IsSubgraphOn()) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); - } - } else { - // if 0 then no input to mask: mask the gradient - // this is fine because gradient won't be used to get feature gradients - if (graph_.IsSubgraphOn()) { - MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); - } - } - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateWeight2DerivativeGPU( - layer_dimensions_.input_columns, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_data.data(), - 
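// Illustrative sketch (not from the patch itself): the aggregate/update "flip"
// the forward phase applies. The layer output is A * F * W (A = normalized
// adjacency, F = features, W = weights); aggregation costs O(|E| * width), so
// it is done at whichever feature width is smaller. The real layer can also
// force one order via disable_aggregate_after_update. The callables stand in
// for AggregateAll and UpdateEmbeddings.
#include <cstddef>
#include <functional>

void ForwardOrderSketch(
    size_t input_columns, size_t output_columns, const float* features,
    float* scratch, float* output,
    const std::function<void(size_t, const float*, float*)>& aggregate,
    const std::function<void(const float*, float*)>& update) {
  if (input_columns <= output_columns) {
    aggregate(input_columns, features, scratch); // A * F   (narrow side first)
    update(scratch, output);                     // (A F) W
  } else {
    update(features, scratch);                   // F W     (shrink width first)
    aggregate(output_columns, scratch, output);  // A (F W)
  }
}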
input_gradient->data(), p_layer_weight_gradients_2_.data()); - } else { -#endif - // input data (prev layer input or temp1) or gradient need mask - // can mask gradient if layer == 0 - // otherwise must mask other - - galois::StatTimer concat_grad_timer("ConcatGradMultiply", kRegionName); - TimerStart(&concat_grad_timer); - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.output_rows, layer_dimensions_.output_columns, - input_data.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); - TimerStop(&concat_grad_timer); - -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - weight_gradient_sync_timer2.start(); - WeightGradientSyncSum2(); - weight_gradient_sync_timer2.stop(); - - // derivative of aggregation/update - // TODO clean up logic here to reduce nesting - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - // aggdata can == p_intemp1; in other words, need to use before overwrite - // mask it, then use it - // XXX masking may not be required in sampling case where rows change - if (layer_number_ != 0 || sage_config_.disable_concat) { - if (graph_.IsSubgraphOn()) { - MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); - } - } - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // XXX output rows - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, agg_data.data(), - input_gradient->data(), p_layer_weight_gradients_.data()); - } else { -#endif - // agg data holds aggregated feature vectors from forward phase - galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); - TimerStart(&normal_grad_timer); - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.output_rows, layer_dimensions_.output_columns, - agg_data.data(), input_gradient->data(), - p_layer_weight_gradients_.data()); - TimerStop(&normal_grad_timer); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - // 0 means input gradient shouldn't get masked - if (layer_number_ != 0) { - // NOTE: this is super nice because it avoids aggregation completely - // in the layer 0 setting - // ---unmasked--- - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - // pintemp1 contains (AF)' - // overwrites the dropout matrix that was in ptemp1 (needed for second - // weight matrix) - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), - true); - - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); - } - } else { - // xform first - - // --unmasked-- - - // disable concat is part of condition because otherwise this mask - // should have gotten done elsewhere - if (layer_number_ != 0 && sage_config_.disable_concat) { - if (graph_.IsSubgraphOn()) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); - } - } - - // layer number 0 means output needs to be masked because input cannot - // be masked - if 
(layer_number_ == 0) { - // if 0 then no input to mask: mask the gradient - // this is fine because gradient won't be used to get feature gradients - if (graph_.IsSubgraphOn()) { - MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); - } - } - - // W' = F^T (FW)' - // TODO put this in a function -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - } else { -#endif - // input col x input row * input row x output col - galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); - TimerStart(&normal_grad_timer); - galois::CBlasSGEMM(CblasTrans, CblasNoTrans, - layer_dimensions_.input_columns, - layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - TimerStop(&normal_grad_timer); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - // to get a correct result out temp mask cannot be masked; - // outtemp will only be masked if layer number is 0, so this - // is safe in all other cases - if (layer_number_ != 0) { - // derivative for update - // backout = F' - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data(), false); - } - } - - weight_gradient_sync_timer.start(); - WeightGradientSyncSum(); - weight_gradient_sync_timer.stop(); - - // full gradient needed here; should occur after all updates - if (layer_number_ != 0) { - // deal with feature gradients for the self feature here - // this function will sum directly into the backward matrix - // input gradient never gets masked if layer number != 0 - SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), - p_backward_output_matrix_.data()); - } - - if (!config_.disable_dropout && layer_number_ != 0) { - DoDropoutDerivative(); - } - - TimerStop(&timer); - return p_backward_output_matrix_; -} - -void galois::SAGELayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts) { - AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); -} - -void galois::SAGELayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts, - bool is_backward) { - std::string agg_timer_name = "AggregateCompute"; - std::string agg_sync_timer_name = "AggregateSync"; - size_t num_rows_to_handle; - if (!is_backward) { - agg_timer_name += "Forward"; - agg_sync_timer_name += "Forward"; - num_rows_to_handle = layer_dimensions_.output_rows; - } else { - agg_timer_name += "Backward"; - agg_sync_timer_name += "Backward"; - num_rows_to_handle = layer_dimensions_.input_rows; - } - galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); - galois::StatTimer aggregate_all_sync_timer(agg_sync_timer_name.c_str(), kRegionName); - TimerStart(&timer); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - if (!IsSampledLayer()) { - gpu_object_.AggregateAllGPU( - graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization, is_backward); - } else { - // TODO(hochan) - 
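// Illustrative sketch (not from the patch itself): the weight-gradient product
// the backward phase computes with a transposed SGEMM, W_grad = F^T * G, where
// F is the (rows x input_columns) input or aggregated features and G is the
// (rows x output_columns) incoming gradient. Plain row-major CBLAS is used
// here instead of the galois::CBlasSGEMM wrapper; assumes an OpenBLAS-style
// cblas.h.
#include <cblas.h>

void WeightGradientSketch(int rows, int input_columns, int output_columns,
                          const float* features, const float* grad,
                          float* weight_grad) {
  // C(input_columns x output_columns) = A^T(input_columns x rows) * B(rows x output_columns)
  cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
              /*M=*/input_columns, /*N=*/output_columns, /*K=*/rows,
              1.0f, features, input_columns, grad, output_columns,
              0.0f, weight_grad, output_columns);
}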
GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); - } - graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); - } else { -#endif - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, - is_backward); - TimerStop(&timer); - - // aggregate sync - aggregate_all_sync_timer.start(); - graph_.AggregateSync(aggregate_output, column_length, is_backward, - num_rows_to_handle); - aggregate_all_sync_timer.stop(); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::SAGELayer::AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*, - bool is_backward) { - // aggregation causes a row count change - size_t num_rows_to_handle; - if (!is_backward) { - num_rows_to_handle = layer_dimensions_.output_rows; - } else { - num_rows_to_handle = layer_dimensions_.input_rows; - } - - galois::do_all( - galois::iterate(*(graph_.begin()), num_rows_to_handle), - [&](size_t src) { - size_t index_to_src_feature = src * column_length; - // zero out src feature first - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = 0; - } - - GNNFloat source_norm = 0.0; - if (!config_.disable_normalization) { - source_norm = graph_.GetDegreeNorm(src, graph_user_layer_number_); - } - - if (!is_backward) { - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); - e++) { - if (layer_phase_ == GNNPhase::kTrain || - layer_phase_ == GNNPhase::kBatch) { - // XXX - // galois::gDebug("In here"); - if (IsSampledLayer()) { - if (!graph_.IsEdgeSampled(e, graph_user_layer_number_)) { - continue; - } - } - } - size_t dst = graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - size_t index_to_dst_feature = dst * column_length; - - if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm; - assert(norm_scale != 0); - - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); - } - } - } else { - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); - e++) { - if (layer_phase_ == GNNPhase::kTrain || - layer_phase_ == GNNPhase::kBatch) { - // XXX - if (IsSampledLayer()) { - if (!graph_.IsInEdgeSampled(e, graph_user_layer_number_)) { - continue; - } - } - } - size_t dst = graph_.GetInEdgeDest(e); - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - - // input row x output row in backward means that i shouldn't be - // touching nodes past output rows; the above sample check - // should deal with this where this matters - assert(dst < layer_dimensions_.output_rows); - - size_t index_to_dst_feature = dst * column_length; - - if (!config_.disable_normalization) { - GNNFloat norm_scale = - graph_.GetDegreeNorm(dst, graph_user_layer_number_); - - assert(norm_scale != 0); - - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - 
&aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); - } - } - } - }, - galois::chunk_size<1>(), galois::steal(), - galois::loopname("SAGEAggregateAll")); -} - -void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, - GNNFloat* output, bool after) { - galois::StatTimer timer("ForwardXForm", kRegionName); - TimerStart(&timer); -#ifdef GALOIS_ENABLE_GPU - // TODO self change - // XXX(hochan) output rows - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // galois::gDebug("Layer ", graph_user_layer_number_, " ", - // layer_dimensions_.output_rows, " ", - // layer_dimensions_.input_columns, " ", - // layer_dimensions_.output_columns); - // CPU version is just a call into CBlas - if (after) { - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, p_layer_weights_.data(), output); - } else { - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, p_layer_weights_.data(), output); - } -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::SAGELayer::SelfFeatureUpdateEmbeddings( - const GNNFloat* node_embeddings, GNNFloat* output) { - galois::StatTimer timer("SelfForwardXForm", kRegionName); - TimerStart(&timer); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.SelfFeatureUpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, output); - } else { -#endif - // note use of layer weights 2 differentiates this from above - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_2_.data(), output, true); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, - GNNFloat* output, - bool after) { - galois::StatTimer timer("BackwardXForm", kRegionName); - TimerStart(&timer); - - assert(p_layer_weights_.size() >= - layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, gradients, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - // note input rows is used here due to transpose of aggregation - if (after) { - galois::CBlasSGEMM( - CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, - layer_dimensions_.output_columns, layer_dimensions_.input_columns, - gradients, p_layer_weights_.data(), output); - } else { - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - p_layer_weights_.data(), output); - } -#ifdef GALOIS_ENABLE_GPU - } 
-#endif - TimerStop(&timer); -} - -void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( - const GNNFloat* gradients, GNNFloat* output) { - galois::StatTimer timer("SelfBackwardXForm", kRegionName); - TimerStart(&timer); - - assert(p_layer_weights_.size() >= - layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.SelfFeatureUpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, output); - } else { -#endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - // true at end -> accumulate - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_2_.data(), output, true); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, - size_t trainable_layer_number) { - galois::StatTimer total_gradient_timer("GradientDescent", kRegionName); - total_gradient_timer.start(); - optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, - trainable_layer_number); - if (!sage_config_.disable_concat) { - second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, - p_layer_weights_2_, 0); - } - total_gradient_timer.stop(); -} diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 595fd5c023..372751f052 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -1,114 +1 @@ #include "galois/layers/SigmoidLayer.h" -#include "galois/GNNMath.h" -#include - -// TODO(loc) GPU support - -const galois::PointerWithSize -galois::SigmoidLayer::ForwardPhaseCPU( - const galois::PointerWithSize input_embeddings) { - galois::gWarn( - "Sigmoid layer has not been kept up to date; do not use unless sure" - " it works with new changes"); - - input_loss_.assign(input_loss_.size(), 0.0); - forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); - const size_t feature_length = layer_dimensions_.input_columns; - node_count_.reset(); - float_accumulator_.reset(); - - galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), - [&](const unsigned local_node) { - if (graph_.IsValidForPhase(local_node, layer_phase_)) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(local_node)) - return; - } - - node_count_ += 1; - - size_t node_offset = feature_length * local_node; - // sigmoid the values for this node - for (unsigned index = 0; index < feature_length; index++) { - // splitting in half is done for numerical stability of log - if (input_embeddings[node_offset + index] >= 0) { - forward_output_matrix_[node_offset + index] = - 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); - } else { - forward_output_matrix_[node_offset + index] = - expf(input_embeddings[node_offset + index]) / - (1.0 + expf(input_embeddings[node_offset + index])); - } - } - - input_loss_[local_node] = GNNCrossEntropy( - feature_length, graph_.GetMultiClassLabel(local_node), - &forward_output_matrix_[node_offset]); - // TODO(loc) normalize the loss - float_accumulator_ += input_loss_[local_node]; - } - }, - galois::steal(), galois::loopname("SigmoidForward")); - - galois::gPrint("Average loss is ", - float_accumulator_.reduce() / 
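// Illustrative sketch (not from the patch itself): the two-branch sigmoid the
// sigmoid forward pass uses for numerical stability. Evaluating expf only on a
// non-positive argument keeps it from overflowing for large |x|; both branches
// are the same function algebraically.
#include <cmath>

inline float StableSigmoid(float x) {
  if (x >= 0.0f)
    return 1.0f / (1.0f + std::exp(-x)); // exp(-x) <= 1 here, no overflow
  const float e = std::exp(x);           // x < 0, so e <= 1
  return e / (1.0f + e);
}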
node_count_.reduce(), "\n"); - return forward_output_matrix_; -} - -const galois::PointerWithSize -galois::SigmoidLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { -#ifdef GALOIS_ENABLE_GPU - // TODO(loc) when GPU needs it - printf("%p\n", input_embeddings.data()); - return p_layer_weights_; -#else - return ForwardPhaseCPU(input_embeddings); -#endif -} - -galois::PointerWithSize -galois::SigmoidLayer::BackwardPhaseCPU() { - const size_t feature_length = layer_dimensions_.input_columns; - galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), - [&](size_t i) { p_backward_output_matrix_[i] = 0; }); - - galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), - [&](const unsigned local_node) { - if (graph_.IsValidForPhase(local_node, layer_phase_)) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(local_node)) - return; - } - - // derivative cross entropy into norm grad - const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); - size_t node_offset = feature_length * local_node; - // sigmoid-cross-entropy derivative: turns out all it is is simple - // subtraction - for (unsigned index = 0; index < feature_length; index++) { - p_backward_output_matrix_[node_offset + index] = - forward_output_matrix_[node_offset + index] - - ground_truth[index]; - } - } - }, - galois::steal(), galois::loopname("SigmoidBackward")); - - return p_backward_output_matrix_; -} - -galois::PointerWithSize -galois::SigmoidLayer::BackwardPhase(PointerWithSize, - PointerWithSize*) { -#ifdef GALOIS_ENABLE_GPU - // TODO(loc) when GPU needs it - return p_layer_weights_; -#else - return BackwardPhaseCPU(); -#endif -} diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp deleted file mode 100644 index aebbb3dd9b..0000000000 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include "galois/Logging.h" -#include "galois/GNNMath.h" -#include "galois/layers/SoftmaxLayer.h" - -const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhaseCPU( - const galois::PointerWithSize input_embeddings) { - galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); - TimerStart(&timer); - - // note: p_backward == input_embeddings - input_loss_.assign(input_loss_.size(), 0.0); - const size_t feature_length = layer_dimensions_.input_columns; -#ifndef NDEBUG - galois::DGAccumulator loss_accum; - galois::DGAccumulator handled; - loss_accum.reset(); - handled.reset(); -#endif - - galois::do_all( - galois::iterate(size_t{0}, layer_dimensions_.input_rows), - [&](const unsigned i) { - if (IsSampledLayer()) { - if ((layer_phase_ == GNNPhase::kTrain || - layer_phase_ == GNNPhase::kBatch) && - !graph_.IsInSampledGraphSubgraph(i)) { - // XXX - VectorZero(feature_length, - &p_backward_output_matrix_[i * feature_length]); - return; - } - } - - // do softmax - GNNSoftmax(feature_length, &input_embeddings[feature_length * i], - &p_backward_output_matrix_[feature_length * i]); - // create ground truth vector for this LID - std::vector* ground_truth_vec = - ground_truth_vectors_.getLocal(); - assert(ground_truth_vec->size() == feature_length); - ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - // single class label is an index; set the correct one - (*ground_truth_vec)[static_cast( - graph_.GetSingleClassLabel(i))] = 1.0; - - // calculate loss for this LID (note not all i will be filled) - input_loss_[i] = - GNNCrossEntropy(feature_length, ground_truth_vec->data(), - 
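// Illustrative sketch (not from the patch itself): a standard numerically
// stable softmax of the kind GNNSoftmax computes per node before the
// cross-entropy loss; the actual GNNSoftmax implementation may differ in
// details. Subtracting the row maximum before exponentiating avoids overflow,
// and the result sums to 1.
#include <algorithm>
#include <cmath>
#include <cstddef>

void SoftmaxSketch(size_t n, const float* logits, float* out) {
  const float max_val = *std::max_element(logits, logits + n);
  float sum = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    out[i] = std::exp(logits[i] - max_val);
    sum += out[i];
  }
  for (size_t i = 0; i < n; ++i)
    out[i] /= sum;
}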
&p_backward_output_matrix_[feature_length * i]); -#ifndef NDEBUG - loss_accum += input_loss_[i]; - handled += 1; -#endif - }, - // TODO chunk size? - // steal on as some threads may have nothing to work on - // galois::steal(), galois::loopname("SoftmaxForward")); - galois::steal()); -#ifndef NDEBUG - GNNFloat reduced_loss = loss_accum.reduce(); - size_t t = handled.reduce(); - galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); -#endif - - TimerStop(&timer); - return p_backward_output_matrix_; -} - -const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.ForwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - input_embeddings.data(), p_backward_output_matrix_.data()); - return p_backward_output_matrix_; - } -#endif - return ForwardPhaseCPU(input_embeddings); -} - -galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhaseCPU() { - galois::StatTimer timer("SoftmaxBackward", "SoftmaxLayer"); - TimerStart(&timer); - - const size_t feature_length = layer_dimensions_.input_columns; - - galois::do_all( - galois::iterate(size_t{0}, layer_dimensions_.input_rows), - [&](const unsigned node) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraphSubgraph(node)) - return; - } - - size_t correct = graph_.GetSingleClassLabel(node); - // See here for explanation for why this works - // https://gombru.github.io/2018/05/23/cross_entropy_loss/ - // Derivation of full combined derivative isn't there, but some - // emperical inspection tells me this is likely correct - // TODO(loc) work it out myself - for (size_t idx = 0; idx < feature_length; idx++) { - if (idx == correct) { - // positive class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx] - 1; - } else { - // negative class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx]; - } - } - }, - galois::steal(), galois::loopname("SoftmaxBackward")); - - TimerStop(&timer); - - return p_backward_output_matrix_; -} - -galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(PointerWithSize, - PointerWithSize*) { -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.BackwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - p_backward_output_matrix_.data(), p_backward_output_matrix_.data()); - return p_backward_output_matrix_; - } -#endif - return BackwardPhaseCPU(); -} - -// TODO function for getting loss diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index e646259f87..00aa14bce6 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,6 +2,7 @@ find_package(OpenMP) add_executable(mkl_micro mkl_micro.cpp) target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) +target_link_directories(mkl_micro PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_include_directories(mkl_micro PUBLIC ${MKL_INCLUDE_DIRS} ) @@ -9,6 +10,7 @@ target_link_libraries(mkl_micro ${INTEL_LIBS}) add_executable(mkl_micro_omp mkl_micro.cpp) target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) +target_link_directories(mkl_micro_omp PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_include_directories(mkl_micro_omp PUBLIC ${MKL_INCLUDE_DIRS} ) diff --git a/libgnn/test/accuracy-test.cpp 
b/libgnn/test/accuracy-test.cpp index 6d26284325..f2d34c0403 100644 --- a/libgnn/test/accuracy-test.cpp +++ b/libgnn/test/accuracy-test.cpp @@ -13,8 +13,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load test graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional}; @@ -26,7 +26,7 @@ int main() { std::vector adam_sizes = {21}; auto adam = std::make_unique(adam_sizes, 1); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 gnn->SetAllLayerWeightsTo1(); diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index d95931a798..549e6c7c53 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -9,8 +9,8 @@ int main() { GALOIS_LOG_WARN("This test should be run with multiple hosts/processes!"); } - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // print edges for sanity for (size_t node = 0; node < test_graph->size(); node++) { @@ -42,8 +42,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique( + std::unique_ptr> layer_0 = + std::make_unique>( 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -125,8 +125,8 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// - std::unique_ptr layer_1 = - std::make_unique( + std::unique_ptr> layer_1 = + std::make_unique>( 1, *(test_graph.get()), &p_back, dimension_0, l_config); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = @@ -206,8 +206,8 @@ int main() { } } ////////////////////////////////////////////////////////////////////////////// - auto test_graph_2 = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + auto test_graph_2 = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true, false); // print edges for sanity for (size_t node = 0; node < test_graph_2->size(); node++) { for (auto e = test_graph_2->edge_begin(node); @@ -232,7 +232,7 @@ int main() { l_config.DebugConfig(); // create the layer, no norm factor - layer_0 = std::make_unique( + layer_0 = std::make_unique>( 0, *(test_graph_2.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); @@ -300,7 +300,7 @@ int main() { std::vector back_matrix_2(test_graph_2->size() * 3); galois::PointerWithSize p_back_2(back_matrix_2); - layer_1 = std::make_unique( + layer_1 = std::make_unique>( 1, *(test_graph_2.get()), &p_back_2, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1_forward_output = diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp index 480058f6ae..6229c9288c 100644 --- a/libgnn/test/back-conv-test.cpp +++ b/libgnn/test/back-conv-test.cpp @@ -11,8 +11,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - 
galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true, false); galois::PointerWithSize feats = test_graph.GetLocalFeatures(); for (size_t row = 0; row < test_graph.size(); row++) { @@ -70,8 +70,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 5902d059fa..1bec3b4b31 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::PointerWithSize feats = test_graph.GetLocalFeatures(); @@ -60,8 +60,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -125,8 +125,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = @@ -202,8 +202,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - std::unique_ptr layer_2 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_2 = + std::make_unique>(1, test_graph, &p_back, dimension_0, config); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index ed665684f1..c0b4ede716 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -13,8 +13,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load graph - auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kCVC, true); + auto test_graph = std::make_unique>( + "cora", galois::graphs::GNNPartitionScheme::kCVC, true, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -34,7 +34,7 @@ int main() { 16 * test_graph->GetNumLabelClasses()}; auto adam = std::make_unique(adam_sizes, 2); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/test/f1-test.cpp b/libgnn/test/f1-test.cpp index 64935bc235..363c12861b 100644 --- a/libgnn/test/f1-test.cpp +++ b/libgnn/test/f1-test.cpp @@ -8,8 +8,8 @@ int main() { galois::DistMemSys G; // load test graph; false at end = multilabel - 
galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); // perfect precision and recall std::vector prediction = { diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 69c64105f6..da0e6bd3f9 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // 2 layer test with softmax std::vector layer_types = { @@ -28,8 +28,8 @@ int main() { std::vector adam_sizes = {12, 28}; auto adam = std::make_unique(adam_sizes, 2); - galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(adam), - std::move(gnn_config)); + galois::GraphNeuralNetwork + gnn(std::move(test_graph), std::move(adam), std::move(gnn_config)); // note this does not include output layer GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index b99c8aeb8d..eb74ffb78a 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // 2 layer test with softmax std::vector layer_types = { @@ -36,7 +36,7 @@ int main() { // middle 2 are trainable so 12 and 28 std::vector adam_sizes = {12, 28}; auto adam = std::make_unique(adam_sizes, 2); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 gnn->SetAllLayerWeightsTo1(); @@ -171,13 +171,13 @@ int main() { GALOIS_LOG_VERBOSE("Running with different congifuration"); - test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, dcon); auto adam2 = std::make_unique(adam_sizes, 2); - auto gnn2 = std::make_unique( + auto gnn2 = std::make_unique>( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); // run to make sure no crashes occur gnn2->DoInference(); diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 6e12b13899..e4451a4900 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -16,11 +16,11 @@ int main() { // note multi level reading tested in another test GALOIS_LOG_VERBOSE("reddit with single label, oec"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC, - true); + galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC, + true, false); GALOIS_LOG_VERBOSE("reddit with single label, cvc"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, - true); + galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, + true, false); // below for when I want to 
check the remapper // galois::graphs::GNNGraph remapper("ogbn-papers100M", diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index 58da1d3b68..646cba3b16 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -26,8 +26,8 @@ int main() { // make this layer to get access to a gpu helper function; TODO // need a helper alloc function - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; dimension_0.input_columns = test_graph.GetNumLabelClasses(); diff --git a/libgnn/test/gpu-aggregate-sync-test.cpp b/libgnn/test/gpu-aggregate-sync-test.cpp index 3a0ee7f3d4..e8d0b9b683 100644 --- a/libgnn/test/gpu-aggregate-sync-test.cpp +++ b/libgnn/test/gpu-aggregate-sync-test.cpp @@ -16,8 +16,8 @@ int main() { gpudevice = galois::runtime::getSystemNetworkInterface().ID; SetCUDADeviceId(gpudevice); - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // create same layer from convlayer-test and make sure result is the same even // in multi-host environment @@ -42,8 +42,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique( + std::unique_ptr> layer_0 = + std::make_unique>( 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -130,8 +130,8 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// - std::unique_ptr layer_1 = - std::make_unique( + std::unique_ptr> layer_1 = + std::make_unique>( 1, *(test_graph.get()), &p_back, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph->GetLocalFeatures()); diff --git a/libgnn/test/gpu-back-conv-test.cpp b/libgnn/test/gpu-back-conv-test.cpp index c089ffb698..2df78d694d 100644 --- a/libgnn/test/gpu-back-conv-test.cpp +++ b/libgnn/test/gpu-back-conv-test.cpp @@ -20,8 +20,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = test_graph.size(); @@ -53,8 +53,8 @@ int main() { galois::PointerWithSize output_layer(output_matrix); // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); galois::PointerWithSize dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->InitAllWeightsTo1(); diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 3a822cf9c5..a36740b5e3 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -15,8 +15,8 @@ int main() { num_threads); device_personality = DevicePersonality::GPU_CUDA; // load test graph - galois::graphs::GNNGraph 
test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::PointerWithSize feats = test_graph.GetLocalFeatures(); @@ -52,8 +52,8 @@ int main() { dimension_0.input_columns, dimension_0.output_columns); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -112,8 +112,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -194,8 +194,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - std::unique_ptr layer_2 = - std::make_unique(2, test_graph, &p_back, + std::unique_ptr> layer_2 = + std::make_unique>(2, test_graph, &p_back, dimension_0, config); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); // pointer is to GPU memory: copy it over to a CPU source for verification diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 8b71b81e3f..71a227416c 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -14,8 +14,8 @@ int main() { device_personality = DevicePersonality::GPU_CUDA; // load graph - auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kCVC, true); + auto test_graph = std::make_unique>( + "cora", galois::graphs::GNNPartitionScheme::kCVC, true, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -36,7 +36,7 @@ int main() { std::vector cpu_pred; cpu_pred.resize(test_graph->GetNumLabelClasses() * test_graph->size()); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/test/gpu-sage-layer-test.cpp b/libgnn/test/gpu-sage-layer-test.cpp index 7cec3b9a2b..7af3808c85 100644 --- a/libgnn/test/gpu-sage-layer-test.cpp +++ b/libgnn/test/gpu-sage-layer-test.cpp @@ -21,8 +21,8 @@ int main() { dimension_0.output_columns = 2; // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); unsigned num_layers = 3; test_graph.ResizeGPULayerVector(num_layers); test_graph.InitLayerVectorMetaObjects( @@ -46,8 +46,8 @@ int main() { galois::SAGELayerConfig scon; scon.disable_concat = false; - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, dimension_0, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self @@ -121,7 +121,7 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique(1, test_graph, &p_back, + auto layer_1 = std::make_unique>(1, 
test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -217,7 +217,7 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique(2, test_graph, &p_back, + auto layer_2 = std::make_unique>(2, test_graph, &p_back, dimension_0, config, scon); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); const std::vector& l2_fo = diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 64b7c9e6f0..96875feffa 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -14,8 +14,8 @@ int main() { device_personality = DevicePersonality::GPU_CUDA; // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -29,7 +29,7 @@ int main() { galois::PointerWithSize p_back(back_matrix); // train mode - auto output_layer = std::make_unique( + auto output_layer = std::make_unique>( 3, test_graph, &p_back, dimension_0); // input to softmax std::vector softmax_input(49, 0.0); diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp index ca30c99ac0..6d6b30942e 100644 --- a/libgnn/test/l2norm-layer-test.cpp +++ b/libgnn/test/l2norm-layer-test.cpp @@ -10,8 +10,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -38,7 +38,7 @@ int main() { std::vector back_matrix(14); galois::PointerWithSize p_back(back_matrix); - auto l2_layer = std::make_unique(2, test_graph, &p_back, + auto l2_layer = std::make_unique>(2, test_graph, &p_back, dimension_0); galois::PointerWithSize normed = l2_layer->ForwardPhase(l2_input); diff --git a/libgnn/test/multilabel-epoch-test.cpp b/libgnn/test/multilabel-epoch-test.cpp index 7626abda1d..b0a2430bd1 100644 --- a/libgnn/test/multilabel-epoch-test.cpp +++ b/libgnn/test/multilabel-epoch-test.cpp @@ -13,8 +13,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -34,7 +34,7 @@ int main() { 16 * test_graph->GetNumLabelClasses()}; auto adam = std::make_unique(adam_sizes, 2); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/test/multilabel-read.cpp b/libgnn/test/multilabel-read.cpp index 83debfa2bc..56b8b42071 100644 --- a/libgnn/test/multilabel-read.cpp +++ b/libgnn/test/multilabel-read.cpp @@ -8,8 +8,8 @@ int main() { galois::DistMemSys G; // load test graph; false at end = multilabel - galois::graphs::GNNGraph test_graph( 
- "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); const galois::GNNLabel* labels = test_graph.GetMultiClassLabel(0); unsigned i = 0; diff --git a/libgnn/test/sage-layer-test.cpp b/libgnn/test/sage-layer-test.cpp index 830e147a7c..8551126d37 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; @@ -32,8 +32,8 @@ int main() { std::vector back_matrix(21); galois::PointerWithSize p_back(back_matrix); - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, dimension_0, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self @@ -113,7 +113,7 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique(1, test_graph, &p_back, + auto layer_1 = std::make_unique>(1, test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -205,7 +205,7 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique(1, test_graph, &p_back, + auto layer_2 = std::make_unique>(1, test_graph, &p_back, dimension_0, config, scon); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index f603578c13..b53860d950 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -13,8 +13,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); - galois::graphs::GNNGraph graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); graph.InitializeSamplingData(3, false); // first, assert all edges are not sampled (should come with all 0s) diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 3540582ade..927f4b0e9f 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -17,8 +17,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; @@ -43,8 +43,8 @@ int main() { std::vector back_matrix(21); galois::PointerWithSize p_back(back_matrix); - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->EnableSampling(); @@ -145,7 +145,7 @@ int main() { std::vector back_matrix_2(49); galois::PointerWithSize p_back_2(back_matrix_2); - auto output_layer = std::make_unique( + 
auto output_layer = std::make_unique>( 3, test_graph, &p_back_2, dimension_out); output_layer->EnableSampling(); galois::PointerWithSize prediction_distribution = @@ -186,10 +186,10 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // sigmoid ////////////////////////////////////////////////////////////////////////////// - galois::graphs::GNNGraph multi_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph multi_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); - auto sigmoid_layer = std::make_unique( + auto sigmoid_layer = std::make_unique>( 3, multi_graph, &p_back_2, dimension_out); sigmoid_layer->EnableSampling(); // reuse softmax input; only thing interested in is checking for 0s diff --git a/libgnn/test/sigmoidlayer-test.cpp b/libgnn/test/sigmoidlayer-test.cpp index 0bc2cd7252..9fd861deff 100644 --- a/libgnn/test/sigmoidlayer-test.cpp +++ b/libgnn/test/sigmoidlayer-test.cpp @@ -15,8 +15,8 @@ int main() { galois::setActiveThreads(1); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -51,7 +51,7 @@ int main() { galois::PointerWithSize p_back(back_matrix); // train mode - auto output_layer = std::make_unique( + auto output_layer = std::make_unique>( 3, test_graph, &p_back, dimension_0); output_layer->ForwardPhase(softmax_input); diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index 66c4e557bc..1ca2740729 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -17,8 +17,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -43,7 +43,7 @@ int main() { galois::PointerWithSize p_back(back_matrix); // train mode - auto output_layer = std::make_unique( + auto output_layer = std::make_unique>( 3, test_graph, &p_back, dimension_0); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index f33fd89c38..60e9fe75b4 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -8,8 +8,9 @@ int main(int argc, char* argv[]) { galois::StatTimer init_timer("InitializationTime"); init_timer.start(); - std::unique_ptr gnn = - InitializeGraphNeuralNetwork(); + std::unique_ptr< + galois::GraphNeuralNetwork> gnn = + InitializeGraphNeuralNetwork(); gnn->SetLayerPhases(galois::GNNPhase::kTrain); init_timer.stop(); diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index 87b12de63d..0bce4b5819 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -108,7 +108,7 @@ extern cll::opt dataset; //! partitioning scheme to use extern cll::opt partitionScheme; //! 
true if input graph file format is SHAD WMD -extern cll::opt useShad; +extern cll::opt useWMD; // @todo command line argument for read balancing across hosts @@ -136,22 +136,22 @@ std::unique_ptr> constructSymmetricGraph(std::vect case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case GNN_OEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case GNN_CVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index 611a7c3e50..5e1a2dbe81 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -42,7 +42,7 @@ cll::opt partitionScheme( "gnn cvc: train nodes evenly distributed")), cll::init(GNN_OEC)); -cll::opt useShad("useShad", cll::desc("true if the input graph is" - " SHAD WMD graph format." - " Otheriwse, set false."), +cll::opt useWMD("useWMD", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), cll::init(false)); diff --git a/lonestar/libdistbench/include/DistBench/Input.h b/lonestar/libdistbench/include/DistBench/Input.h index 396b01a983..d7e9cb8568 100644 --- a/lonestar/libdistbench/include/DistBench/Input.h +++ b/lonestar/libdistbench/include/DistBench/Input.h @@ -100,7 +100,7 @@ extern cll::opt symmetricGraph; //! partitioning scheme to use extern cll::opt partitionScheme; //! true if input graph file format is SHAD WMD -extern cll::opt useShad; +extern cll::opt useWMD; ////! path to vertex id map for custom edge cut // extern cll::opt vertexIDMapFileName; //! 
true if you want to read graph structure from a file @@ -145,18 +145,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case OEC: case IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose, mastersFile); case HOVC: case HIVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); case CART_VCUT: case CART_VCUT_IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); // case CEC: @@ -166,18 +166,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: case GINGER_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad ,true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD ,true, inputFileTranspose); case FENNEL_O: case FENNEL_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); default: GALOIS_DIE("partition scheme specified is invalid: ", partitionScheme); @@ -206,19 +206,19 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { auto& net = galois::runtime::getSystemNetworkInterface(); if (net.Num == 1) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("incoming edge cut requires transpose graph"); @@ -227,12 +227,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("incoming hybrid cut requires transpose graph"); @@ -241,13 +241,13 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, 
inputFileTranspose); } else { GALOIS_DIE("cvc incoming cut requires transpose graph"); @@ -260,12 +260,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -274,12 +274,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -288,7 +288,7 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); default: @@ -320,7 +320,7 @@ DistGraphPtr constructGraph(std::vector&) { if (net.Num == 1) { if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { fprintf(stderr, "WARNING: Loading transpose graph through in-memory " @@ -328,7 +328,7 @@ DistGraphPtr constructGraph(std::vector&) { "graph with -graphTranspose to avoid unnecessary " "overhead.\n"); return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } } @@ -336,12 +336,12 @@ DistGraphPtr constructGraph(std::vector&) { switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("iec requires transpose graph"); @@ -350,12 +350,12 @@ DistGraphPtr constructGraph(std::vector&) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("hivc requires transpose graph"); @@ -364,13 +364,13 @@ DistGraphPtr constructGraph(std::vector&) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, 
galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph(inputFile, galois::CUSP_CSC, - galois::CUSP_CSC, useShad, + galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { @@ -380,12 +380,12 @@ DistGraphPtr constructGraph(std::vector&) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -394,12 +394,12 @@ DistGraphPtr constructGraph(std::vector&) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -408,7 +408,7 @@ DistGraphPtr constructGraph(std::vector&) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); default: diff --git a/lonestar/libdistbench/include/DistBench/Output.h b/lonestar/libdistbench/include/DistBench/Output.h index 51df733952..e15bbc45c6 100644 --- a/lonestar/libdistbench/include/DistBench/Output.h +++ b/lonestar/libdistbench/include/DistBench/Output.h @@ -1,6 +1,7 @@ #ifndef GALOIS_DISTBENCH_OUTPUT_H #define GALOIS_DISTBENCH_OUTPUT_H +#include #include #include #include "galois/gIO.h" diff --git a/lonestar/libdistbench/src/Input.cpp b/lonestar/libdistbench/src/Input.cpp index 04321bd14e..844591506f 100644 --- a/lonestar/libdistbench/src/Input.cpp +++ b/lonestar/libdistbench/src/Input.cpp @@ -60,9 +60,9 @@ cll::opt partitionScheme( "fennel, incoming edge cut, using CuSP")), cll::init(OEC)); -cll::opt useShad("useShad", cll::desc("true if the input graph is" - " SHAD WMD graph format." - " Otheriwse, set false."), +cll::opt useWMD("useWMD", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), cll::init(false)); cll::opt readFromFile("readFromFile", diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index bb417a90f2..50713cae67 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -14,9 +14,168 @@ extern llvm::cl::opt input_directory; extern llvm::cl::opt input_name; //! 
Scheme used to partition the graph extern llvm::cl::opt partition_scheme; +extern llvm::cl::opt num_layers; +extern llvm::cl::opt layer_size; +extern llvm::cl::opt learning_rate; +extern llvm::cl::opt output_layer_type; +extern llvm::cl::opt multiclass_labels; +extern llvm::cl::opt do_graph_sampling; +extern llvm::cl::opt useWMD; +extern llvm::cl::opt use_train_subgraph; +extern llvm::cl::opt minibatch_test_interval; +extern llvm::cl::opt test_interval; +extern llvm::cl::opt val_interval; +extern llvm::cl::opt train_minibatch_size; +extern llvm::cl::opt test_minibatch_size; +extern llvm::cl::opt inductive_subgraph; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); +std::vector CreateLayerTypesVector(); + +template +std::vector +CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { + // set layer sizes for intermdiate and output layers + std::vector layer_sizes_vector; + + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // for (size_t i = 0; i < num_layers; i++) { + // layer_sizes_vector.emplace_back(layer_sizes[i]); + // } + // // verify user satisfies last intermediate layer needing to have same size + // // as # label classes + // if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { + // galois::gWarn( + // "Size of last layer (", layer_sizes_vector.back(), + // ") is not equal to # label classes: forcefully changing it to ", + // gnn_graph->GetNumLabelClasses()); + // layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); + // layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); + // } + + // GALOIS_LOG_ASSERT(layer_sizes_vector.back() == + // gnn_graph->GetNumLabelClasses()); + //} else { + // // default 16 for everything until last 2 + // for (size_t i = 0; i < num_layers - 1; i++) { + // layer_sizes_vector.emplace_back(16); + // } + // // last 2 sizes must be equivalent to # label classes; this is the last + // // intermediate layer + // layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + //} + + for (size_t i = 0; i < num_layers - 1; i++) { + layer_sizes_vector.emplace_back(layer_size); + } + // last 2 sizes must be equivalent to # label classes; this is the last + // intermediate layer + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + // TODO + // for now only softmax layer which dictates the output size of the last + // intermediate layer + size of the output layer + // output layer at the moment required to be same as # label classes + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + + return layer_sizes_vector; +} + +galois::GNNLayerConfig CreateLayerConfig(); + +template +std::unique_ptr +CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { + std::vector opt_sizes; + + // optimizer sizes are based on intermediate layer sizes, input feats, and + // # label classes + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); + // // assumption here is that if it reached this point then layer sizes were + // // already sanity checked previously (esp. 
last layer) + // for (size_t i = 1; i < num_layers; i++) { + // opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); + // } + //} else { + // // everything is size 16 until last + // if (num_layers == 1) { + // // single layer requires a bit of special handling + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * + // gnn_graph->GetNumLabelClasses()); + // } else { + // // first + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); + // for (size_t i = 1; i < num_layers - 1; i++) { + // opt_sizes.emplace_back(16 * 16); + // } + // // last + // opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + // } + //} + + // everything is size 16 until last + if (num_layers == 1) { + // single layer requires a bit of special handling + opt_sizes.emplace_back(gnn_graph->node_feature_length() * + gnn_graph->GetNumLabelClasses()); + } else { + // first + opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_size); + for (size_t i = 1; i < num_layers - 1; i++) { + opt_sizes.emplace_back(layer_size * layer_size); + } + // last + opt_sizes.emplace_back(layer_size * gnn_graph->GetNumLabelClasses()); + } + GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); + + galois::AdamOptimizer::AdamConfiguration adam_config; + adam_config.alpha = learning_rate; + + // TODO only adam works right now, add the others later + return std::make_unique(adam_config, opt_sizes, + num_layers); +} + +std::vector CreateFanOutVector(); + //! Using command line args above, create a GNN using some specified layer type //! as the intermediate layer. -std::unique_ptr InitializeGraphNeuralNetwork(); +template +std::unique_ptr> +InitializeGraphNeuralNetwork() { + // partition/load graph + auto gnn_graph = std::make_unique>( + input_directory, input_name, partition_scheme, !multiclass_labels, + useWMD); + + // create layer types vector + std::vector layer_types = CreateLayerTypesVector(); + // sizes + std::vector layer_sizes_vector = + CreateLayerSizesVector(gnn_graph.get()); + // layer config object + galois::GNNLayerConfig layer_config = CreateLayerConfig(); + // GNN config object + galois::GraphNeuralNetworkConfig gnn_config( + num_layers, layer_types, layer_sizes_vector, output_layer_type, + do_graph_sampling, layer_config); + gnn_config.use_train_subgraph_ = use_train_subgraph; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; + gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.test_minibatch_size_ = test_minibatch_size; + gnn_config.minibatch_test_interval_ = minibatch_test_interval; + gnn_config.inductive_subgraph_ = inductive_subgraph; + gnn_config.fan_out_vector_ = CreateFanOutVector(); + + // optimizer + std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); + + // create the gnn + return std::make_unique>( + std::move(gnn_graph), std::move(opt), std::move(gnn_config)); +} diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d1685b8e2b..44b11cfa9b 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -26,9 +26,9 @@ llvm::cl::opt partition_scheme( "Original Cartesian Vertex-Cut")), cll::init(galois::graphs::GNNPartitionScheme::kOEC)); -cll::opt useShad("useShad", cll::desc("true if the input graph is" - " SHAD WMD graph format." - " Otheriwse, set false."), +cll::opt useWMD("useWMD", cll::desc("true if the input graph is" + " SHAD WMD graph format." 
+ " Otheriwse, set false."), cll::init(false)); llvm::cl::opt num_layers( @@ -206,55 +206,6 @@ std::vector CreateLayerTypesVector() { return layer_types; } -//! Initializes the vector of layer sizes from command line args + graph -std::vector -CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { - // set layer sizes for intermdiate and output layers - std::vector layer_sizes_vector; - - // if (layer_sizes.size()) { - // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - // for (size_t i = 0; i < num_layers; i++) { - // layer_sizes_vector.emplace_back(layer_sizes[i]); - // } - // // verify user satisfies last intermediate layer needing to have same size - // // as # label classes - // if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { - // galois::gWarn( - // "Size of last layer (", layer_sizes_vector.back(), - // ") is not equal to # label classes: forcefully changing it to ", - // gnn_graph->GetNumLabelClasses()); - // layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); - // layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); - // } - - // GALOIS_LOG_ASSERT(layer_sizes_vector.back() == - // gnn_graph->GetNumLabelClasses()); - //} else { - // // default 16 for everything until last 2 - // for (size_t i = 0; i < num_layers - 1; i++) { - // layer_sizes_vector.emplace_back(16); - // } - // // last 2 sizes must be equivalent to # label classes; this is the last - // // intermediate layer - // layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); - //} - - for (size_t i = 0; i < num_layers - 1; i++) { - layer_sizes_vector.emplace_back(layer_size); - } - // last 2 sizes must be equivalent to # label classes; this is the last - // intermediate layer - layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); - // TODO - // for now only softmax layer which dictates the output size of the last - // intermediate layer + size of the output layer - // output layer at the moment required to be same as # label classes - layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); - - return layer_sizes_vector; -} - //! Setup layer config struct based on cli args galois::GNNLayerConfig CreateLayerConfig() { galois::GNNLayerConfig layer_config; @@ -267,61 +218,6 @@ galois::GNNLayerConfig CreateLayerConfig() { return layer_config; } -std::unique_ptr -CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { - std::vector opt_sizes; - - // optimizer sizes are based on intermediate layer sizes, input feats, and - // # label classes - // if (layer_sizes.size()) { - // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - // opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); - // // assumption here is that if it reached this point then layer sizes were - // // already sanity checked previously (esp. 
last layer) - // for (size_t i = 1; i < num_layers; i++) { - // opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); - // } - //} else { - // // everything is size 16 until last - // if (num_layers == 1) { - // // single layer requires a bit of special handling - // opt_sizes.emplace_back(gnn_graph->node_feature_length() * - // gnn_graph->GetNumLabelClasses()); - // } else { - // // first - // opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); - // for (size_t i = 1; i < num_layers - 1; i++) { - // opt_sizes.emplace_back(16 * 16); - // } - // // last - // opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); - // } - //} - - // everything is size 16 until last - if (num_layers == 1) { - // single layer requires a bit of special handling - opt_sizes.emplace_back(gnn_graph->node_feature_length() * - gnn_graph->GetNumLabelClasses()); - } else { - // first - opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_size); - for (size_t i = 1; i < num_layers - 1; i++) { - opt_sizes.emplace_back(layer_size * layer_size); - } - // last - opt_sizes.emplace_back(layer_size * gnn_graph->GetNumLabelClasses()); - } - GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); - - galois::AdamOptimizer::AdamConfiguration adam_config; - adam_config.alpha = learning_rate; - - // TODO only adam works right now, add the others later - return std::make_unique(adam_config, opt_sizes, - num_layers); -} - std::vector CreateFanOutVector() { std::vector fan_out; // fan out only matters if graph sampling is enabled @@ -342,37 +238,3 @@ std::vector CreateFanOutVector() { } return fan_out; } - -std::unique_ptr InitializeGraphNeuralNetwork() { - // partition/load graph - auto gnn_graph = std::make_unique( - input_directory, input_name, partition_scheme, !multiclass_labels, - useShad); - - // create layer types vector - std::vector layer_types = CreateLayerTypesVector(); - // sizes - std::vector layer_sizes_vector = - CreateLayerSizesVector(gnn_graph.get()); - // layer config object - galois::GNNLayerConfig layer_config = CreateLayerConfig(); - // GNN config object - galois::GraphNeuralNetworkConfig gnn_config( - num_layers, layer_types, layer_sizes_vector, output_layer_type, - do_graph_sampling, layer_config); - gnn_config.use_train_subgraph_ = use_train_subgraph; - gnn_config.validation_interval_ = val_interval; - gnn_config.test_interval_ = test_interval; - gnn_config.train_minibatch_size_ = train_minibatch_size; - gnn_config.test_minibatch_size_ = test_minibatch_size; - gnn_config.minibatch_test_interval_ = minibatch_test_interval; - gnn_config.inductive_subgraph_ = inductive_subgraph; - gnn_config.fan_out_vector_ = CreateFanOutVector(); - - // optimizer - std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); - - // create the gnn - return std::make_unique( - std::move(gnn_graph), std::move(opt), std::move(gnn_config)); -} diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index daff6ad114..ed928374cc 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -117,6 +117,33 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("GNNBench", "IsGraphSampled", do_graph_sampling); galois::runtime::reportParam("GNNBench", "LearningRate", learning_rate); + + if (useWMD && + partition_scheme != galois::graphs::GNNPartitionScheme::kOCVC) { + // cvc/oec (GNN-CVC, GNN-OEC in CuSP), not ocvc, are variants + // of the default CuSP cvc/oec partitioning policies. 
+ // The original partitioning policies (including ocvc) only + // consider and attempt to balance the number of master nodes + // for each host. + // However, Galois-GNN chooses training vertices from the original graph, + // and extracts, constructs, uses a subgraph only with them for training. + // In this case, especially Galois-GNN typically chooses a consecutive + // range of vertices as the training vertices. + // This method might cause load imbalancing among hosts since most of the + // training master nodes are skewed to the few hosts. + // In order to alleviate this issue, Galois-GNN provides those variant + // partitioning policies. They consider and attempt to balance the + // number of master "training" nodes for each host. + // SHAD-GNN on WMD graphs is not necessarily constrained to this design. + // SHAD-GNN has the specific number of training vertices, and randomly + // selects vertices from a graph as that, which means that Galois-GNN + // could avoid vertex imbalancing due to the skewness if it chooses + // vertices in balance manner. + // To sum up, we do not support the specialized partitioning policies, + // but choose vertices in balance manner. + GALOIS_LOG_FATAL("Gnn CVC and OEC are not supported for WMD graphs {}", + GNNPartitionToString(partition_scheme)); + } } char name[256]; diff --git a/scripts/shad-gnn/check_feature_construction.py b/scripts/shad-gnn/check_feature_construction.py new file mode 100644 index 0000000000..62538431a6 --- /dev/null +++ b/scripts/shad-gnn/check_feature_construction.py @@ -0,0 +1,51 @@ +import csv + +""" +@autor: Hochan Lee (hochan.lee@amd.com) + +Requirement: + +The below two files should exist on the directory +where this script runs. + +1) solution.csv is the solution file. +2) 2hop.[host id].feat is the results of the feature construction +that we want to check correctness. + +Command: +python check_feature_construction.py + +""" +num_hosts = 4 + +solution = {} +with open("solution.csv", "r") as f: + reader = csv.reader(f) + for row in reader: + rlen = len(row) + feat = [] + for i in range(1, rlen): + feat.append(int(row[i])) + solution[row[0]] = feat + +fail = False +for i in range(0, num_hosts): + with open("2hop."+str(i)+".feat", "r") as f: + reader = csv.reader(f) + for row in reader: + rlen = len(row) + feat = [] + for j in range(1, rlen): + feat.append(int(row[j])) + key = row[0] + + solution_feat = solution[key] + for j in range(0, rlen-1): + if solution_feat[j] != feat[j]: + print(key, " failed at ", j, " on host:", i) + fail = True + +if fail: + print("Verification failed") +else: + print("Verification succeeded") From 40609e2182badbc2bbabfdc85d617088c6f66c3c Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Fri, 15 Sep 2023 20:45:30 -0500 Subject: [PATCH 605/660] Add ego graph construction to GCN This commit adds ego graph construction for each epoch to GCN layer. The original GCN paper did not use ego graphs, but the whole graph, and so, Galois-GNN didn't implement that intentionally. To follow SHAD GCN reference code, we now added that. This commit only contains a single host execution unit test because designing and implementing multi-host execution is a time consuming task and so, I decided to do that later. But, without the unit test, I confirmed correctness of the multi-host execution based on the below changes. 1. Set all layer weights to 1, instead of random values. 2. Used nodes within global node ID range for training. 
(So, the nodes are deterministic) (The original code uses random selection to match SHAD's one) 3. Compared 1-host and multi hosts, like 2 and 4 hosts, accuracy results on the graph sampling mode. 4. They should be same if the GCN graph sampling is correct. --- libgnn/include/galois/graphs/GNNGraph.h | 74 ++++++++- .../galois/layers/GraphConvolutionalLayer.h | 133 ++++++++++++---- libgnn/test/CMakeLists.txt | 10 +- libgnn/test/gcn-sample-edge-test.cpp | 148 ++++++++++++++++++ libgnn/test/sample-test.cpp | 16 ++ 5 files changed, 342 insertions(+), 39 deletions(-) create mode 100644 libgnn/test/gcn-sample-edge-test.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index ad41def334..146daf24b3 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -297,7 +297,6 @@ class GNNGraph { return edge_sample_status_[layer_num].test(ei); } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); - return false; } }; bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { @@ -545,7 +544,6 @@ class GNNGraph { continue; } } - MakeEdgeSampled(edge_iter, agg_layer_num); uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); if (!IsInSampledGraph(dest)) { @@ -1072,7 +1070,57 @@ class GNNGraph { bool MoreTestMinibatches() { return !test_batcher_->NoMoreMinibatches(); }; ////////////////////////////////////////////////////////////////////////////// - GNNFloat GetGCNNormFactor(GraphNode lid) const { + + /** + * @brief Normalization factor calculation for GCN without graph sampling + * + * @detail This function calculates normalization factor for nodes + * on a GCN layer, but not with graph sampling (ego graph construction). + * This normalization is proposed in GCN paper, and its equation is + * D^(-1/2)*A*D^(-1/2). + * XXX(hc): This degraded accuracy when graph sampling was enabled. + * That could be many reasons for that, for example, a graph was already + * small, and so, sampled graphs across layers are too small to normalize, + * or, it might be theoretical design reason as the original GCN + * did not consider ego graph construction. + * For example, the one possible reason is that backward phase and + * forward phase edge iterators are also different and maybe need to + * use different iterators. + * For now, I stopped this analysis and + * just enabled this method for only GCN without graph + * sampling. With graph sampling, I used SAGE's graph normalization. 
+ */ + GNNFloat GetGCNNormFactor(GraphNode lid + /*, size_t graph_user_layer_num*/) const { +#if 0 + if (use_subgraph_ || use_subgraph_view_) { + size_t degree; + if (!subgraph_choose_all_) { + // case because degrees in each layer differ + degree = + sampled_out_degrees_[graph_user_layer_num][ + subgraph_->SIDToLID(lid)]; + } else { + // XXX if inductive + // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + degree = global_degrees_[subgraph_->SIDToLID(lid)]; + } + if (degree) { + return 1.0 / std::sqrt(static_cast(degree) + 1); + } else { + return 0; + } + } else { + if (global_degrees_[lid]) { + if (this->size() != this->active_size()) { + std::cout << lid << " does not match\n"; + } + return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); + } else { + return 0.0; + } + } +#endif if (global_degrees_[lid]) { return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); } else { @@ -1556,6 +1604,26 @@ class GNNGraph { return non_layer_zero_masters_; } + // TODO(hc): `ResizeSamplingBitsets()` and + // `GetDefinitelySampledNodesBset()` expose private member variables + // for unit tests. Other than them, these should not be used. + + void ResizeSamplingBitsets() { + if (!bitset_sampled_degrees_.size()) { + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + } + if (!bitset_sample_flag_.size()) { + bitset_sample_flag_.resize(size()); + } + if (!definitely_sampled_nodes_.size()) { + definitely_sampled_nodes_.resize(partitioned_graph_->size()); + } + } + + galois::DynamicBitSet& GetDefinitelySampledNodesBset() { + return definitely_sampled_nodes_; + } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 2c7a41ecab..be882647a1 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -218,7 +218,25 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.output_columns) { // aggdata can == p_intemp1; in other words, need to use before overwrite // mask it, then use it - this->MaskInputNonMasters(&agg_data); + if (this->layer_number_ != 0) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows); + } + } else { + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows); + } + } #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -231,9 +249,10 @@ class GraphConvolutionalLayer : public GNNLayer { #endif weight_gradient_timer.start(); // temp 2 holds aggregated feature vectors from forward phase + // use output rows since gcn can use subgraphs galois::CBlasSGEMM( CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, - this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_rows, this->layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), this->p_layer_weight_gradients_.data()); weight_gradient_timer.stop(); @@ -249,7 +268,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows * 
this->layer_dimensions_.output_columns); // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + UpdateEmbeddingsDerivative( + input_gradient->data(), p_in_temp_1_.data()); // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -271,11 +291,24 @@ class GraphConvolutionalLayer : public GNNLayer { // means I can mess with the input data itself instad of masking the // gradients I can mask the input if (this->layer_number_ != 0) { - this->MaskInputNonMasters(&input_data); + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows); + } } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - this->MaskGradientNonMasters(&p_out_temp_); + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(&p_out_temp_, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(&p_out_temp_); + } } #ifdef GALOIS_ENABLE_GPU @@ -347,14 +380,18 @@ class GraphConvolutionalLayer : public GNNLayer { void AggregateAllCPU(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*) { + galois::substrate::PerThreadStorage>*, + bool is_backward) { galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = this->graph_.size(); + size_t num_nodes = (is_backward)? + this->layer_dimensions_.input_rows : + this->layer_dimensions_.output_rows; size_t last_master = *(this->graph_.end_owned()); + assert(0 == *(this->graph_.begin_owned())); galois::do_all( - galois::iterate(static_cast(0), num_nodes), + galois::iterate(*(this->graph_.begin()), num_nodes), [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first @@ -364,24 +401,31 @@ class GraphConvolutionalLayer : public GNNLayer { if (this->layer_phase_ == GNNPhase::kTrain) { if (this->IsSampledLayer()) { - // XXX(loc) - GALOIS_LOG_WARN( - "Edge sampling not yet implemented for GCN; only SAGE"); - // check if node is part of sampled graph; ignore after 0'ing if - // not sampled - if (!this->graph_.IsInSampledGraph(src)) + // Check if node is part of sampled graph; ignore after + // 0'ing if it is not sampled. 
+ // TODO(hc): check if SAGE also checks this + if (!this->graph_.IsInSampledGraph(src)) { return; + } } } - GNNFloat source_norm = 0.0; + GNNFloat source_norm = 1.0; if (!this->config_.disable_normalization) { - source_norm = this->graph_.GetGCNNormFactor(src); + if (this->graph_.IsSubgraphOn() || + this->graph_.IsSubgraphViewOn()) { + source_norm = + this->graph_.GetDegreeNorm( + src, this->graph_user_layer_number_); + } else { + source_norm = this->graph_.GetGCNNormFactor(src); + } } // init to self if (!this->config_.disable_self_aggregate) { - graphs::bitset_graph_aggregate.set(src); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); // only aggregate self once on master if (src < last_master) { for (size_t i = 0; i < column_length; i++) { @@ -393,25 +437,44 @@ class GraphConvolutionalLayer : public GNNLayer { } // loop through all destinations to grab the feature to aggregate - for (auto e = this->graph_.edge_begin(src); - e != this->graph_.edge_end(src); e++) { - size_t dst = this->graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set(src); - - if (this->layer_phase_ == GNNPhase::kTrain) { + auto e_beg = (is_backward)? + this->graph_.in_edge_begin(src) : this->graph_.edge_begin(src); + auto e_end = (is_backward)? + this->graph_.in_edge_end(src) : this->graph_.edge_end(src); + for (auto e = e_beg; e != e_end; e++) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { if (this->IsSampledLayer()) { - // ignore non-sampled nodes - if (this->layer_phase_ == GNNPhase::kTrain && - !this->graph_.IsInSampledGraph(dst)) + bool is_sampled = (is_backward)? + this->graph_.IsInEdgeSampled( + e, this->graph_user_layer_number_) : + this->graph_.IsEdgeSampled( + e, this->graph_user_layer_number_); + // ignore non-sampled nodes and edges + if (!is_sampled) { continue; + } } } - + size_t dst = (is_backward)? + this->graph_.GetInEdgeDest(e) : this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); size_t index_to_dst_feature = dst * column_length; if (!this->config_.disable_normalization) { - GNNFloat norm_scale = - source_norm * this->graph_.GetGCNNormFactor(dst); + GNNFloat norm_scale; + if (this->graph_.IsSubgraphOn() || + this->graph_.IsSubgraphViewOn()) { + norm_scale = (is_backward)? 
+ this->graph_.GetDegreeNorm( + dst, this->graph_user_layer_number_) + : source_norm; + } else { + norm_scale = + source_norm * this->graph_.GetGCNNormFactor(dst); + } + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, @@ -429,7 +492,8 @@ class GraphConvolutionalLayer : public GNNLayer { galois::loopname("ConvolutionalAggregateAll")); // aggregate sync aggregate_all_sync_timer.start(); - this->graph_.AggregateSync(aggregate_output, column_length); + this->graph_.AggregateSync(aggregate_output, column_length, + is_backward, num_nodes); aggregate_all_sync_timer.stop(); } @@ -469,7 +533,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_number_); } else { #endif - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, + is_backward); #ifdef GALOIS_ENABLE_GPU } #endif @@ -495,7 +560,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows, this->layer_dimensions_.input_columns, this->layer_dimensions_.output_columns, - node_embeddings, this->layer_weights_.data(), output); + node_embeddings, this->layer_weights_.data(), + output); #ifdef GALOIS_ENABLE_GPU } #endif @@ -503,7 +569,8 @@ class GraphConvolutionalLayer : public GNNLayer { } //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + void UpdateEmbeddingsDerivative( + const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("BackwardXform", kRegionName); timer.start(); diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 00aa14bce6..d005ddd6bc 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -145,13 +145,17 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(f1-test galois_gnn) add_test(NAME f1-test COMMAND f1-test) - add_executable(sample-test sample-test.cpp) - target_link_libraries(sample-test galois_gnn) - add_test(NAME sample-test COMMAND sample-test) + #add_executable(sample-test sample-test.cpp) + #target_link_libraries(sample-test galois_gnn) + #add_test(NAME sample-test COMMAND sample-test) add_executable(sample-bit-test sample-bit-test.cpp) target_link_libraries(sample-bit-test galois_gnn) add_test(NAME sample-bit-test COMMAND sample-bit-test) + + add_executable(gcn-sample-edge-test gcn-sample-edge-test.cpp) + target_link_libraries(gcn-sample-edge-test galois_gnn) + add_test(NAME gcn-sample-edge-test COMMAND gcn-sample-edge-test) else() add_executable(gpu-sage-layer-test gpu-sage-layer-test.cpp) target_link_libraries(gpu-sage-layer-test galois_gnn) diff --git a/libgnn/test/gcn-sample-edge-test.cpp b/libgnn/test/gcn-sample-edge-test.cpp new file mode 100644 index 0000000000..8bb4e74f9a --- /dev/null +++ b/libgnn/test/gcn-sample-edge-test.cpp @@ -0,0 +1,148 @@ +/** + * This test checks correctness by comparing hand calculation + * of the forward and backward phases. + * This is implemented to check correctness of GCN layer. + * Below is the process: + * 1. Mark and check nodes and edges to be initially sampled. + * 2. Nodes adjacent to the sampled edges are sampled. + * 3. Perform forward/backward phases and compare the results + * with hand calculation results. + */ + +// TODO(hc): Designing and implementing multi-host execution is +// a time consuming task and so, I will work on that later. 
+// But, without test, I confirmed correctness of the multi-host +// execution based on the below changes. +// +// 1. Set all layer weights to 1, instead of random values. +// 2. Used nodes within global node ID range for training. +// (So, the nodes are deterministic) +// (The original code uses random selection to match SHAD's one) +// 3. Compared 1-host and multi hosts, like 2 and 4 hosts, +// accuracy results on the graph sampling mode. +// 4. They should be same if the GCN graph sampling is correct. +// (It was on the test done on 09/15/2023) + +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/SAGELayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = 1; + // tester graph: 0 - 1 - 2 - 3 - 4 - 5 - 6 + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); + test_graph.InitializeSamplingData(); + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + dcon.disable_normalization = false; + dcon.DebugConfig(); + // Choose a few sample nodes + test_graph.SetSampledNode(0); + test_graph.SetSampledNode(4); + test_graph.UnsetSampledNode(1); + test_graph.UnsetSampledNode(2); + test_graph.UnsetSampledNode(3); + test_graph.UnsetSampledNode(5); + test_graph.UnsetSampledNode(6); + + test_graph.ResizeSamplingBitsets(); + test_graph.SampleAllEdges(0, false, 1); + + // After the above lines, nodes 0, 1, 3, 4, 5 and + // edges 0, 7, 8 should be sampled. + // So, + // 0 -> 1, 2 <- 3 -> 4 + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(0)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(1)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(3)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(4)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(5)); + + GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(7)); + GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(8)); + + + galois::DynamicBitSet& bset = + test_graph.GetDefinitelySampledNodesBset(); + bset.ParallelReset(); + bset.set(0); + bset.set(1); + bset.set(3); + bset.set(4); + bset.set(5); + test_graph.ConstructSampledSubgraph(1); + test_graph.EnableSubgraph(); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 5; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + // Layer declaration + std::vector back_matrix(15); + galois::PointerWithSize p_back(back_matrix); + std::unique_ptr> layer_1 = + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); + + layer_1->InitAllWeightsTo1(); + layer_1->EnableSampling(); + galois::PointerWithSize features = + test_graph.GetLocalFeatures(); + + galois::PointerWithSize layer_1_forward_output = + layer_1->ForwardPhase(features); + + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 0); + + // Dummy gradients + std::vector dummy_ones_v(10, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + dummy_ones_v.assign(10, 1); + dummy_ones_v[4] = 0; + dummy_ones_v[5] = 0; + + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase( + test_graph.GetLocalFeatures(), &dummy_ones); + 
+ GALOIS_LOG_ASSERT(layer_1_backward_output[0] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[1] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[2] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[3] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[4] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[5] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[6] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[7] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[8] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[9] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[10] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[11] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[12] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[13] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[14] == 2); + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[5] == 6); + + return 0; +} diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 927f4b0e9f..0bda9d81a8 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -1,6 +1,22 @@ //! @file sample-test.cpp //! Sampling tester +/// TODO(hc): This test is deprecated as GCN layer now supports +/// edge sampling, as well as node sampling. +/// The previous GCN only checks if node is sampled, but +/// now it checks edge sampling and for that, it utilizes +/// a bitset to mark sampled edges. +/// If that bitset is not set, the corresponding edge is ignored. +/// However, this test currently does not consider this case, +/// and doesn't work. +/// To satisfy the previous assumption and make this test work, +/// we should mark the entire adjacent edges of the sampled nodes. +/// In this case, we should not mark the edges' destination nodes as +/// sampled nodes, and so, let src node iterator skip those nodes +/// but only allow to iterate them as outgoing destinations. +/// We can reuse this code later, and so, I will not remove this +/// from the current source tree. 
+ #include "galois/Logging.h" #include "galois/GNNMath.h" #include "galois/layers/GraphConvolutionalLayer.h" From e9a2a03d13c6329185820f28e9983faa95877ca3 Mon Sep 17 00:00:00 2001 From: marcopolo4096 Date: Wed, 20 Sep 2023 13:16:03 -0500 Subject: [PATCH 606/660] Fixed cpuinf parsing bug (#10) Co-authored-by: Marko --- libgalois/src/HWTopoLinux.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgalois/src/HWTopoLinux.cpp b/libgalois/src/HWTopoLinux.cpp index 0835b0070e..486e707230 100644 --- a/libgalois/src/HWTopoLinux.cpp +++ b/libgalois/src/HWTopoLinux.cpp @@ -100,7 +100,7 @@ unsigned getNumaNode(cpuinfo& c) { std::vector parseCPUInfo() { std::vector vals; - const int len = 1024; + const int len = 4096; std::array line; std::ifstream procInfo("/proc/cpuinfo"); From 31e32c210ee5da23d0d68d882e071b258511e12c Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Thu, 21 Sep 2023 20:40:57 -0500 Subject: [PATCH 607/660] Add ReLU layer (#9) Co-authored-by: Hochan Lee --- libgnn/include/galois/GraphNeuralNetwork.h | 6 + libgnn/include/galois/layers/GNNLayer.h | 5 +- libgnn/include/galois/layers/ReLULayer.h | 126 +++++++++++++++++++++ lonestar/libgnnbench/src/Input.cpp | 1 + 4 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 libgnn/include/galois/layers/ReLULayer.h diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index c63175f65e..88a48f961c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -14,6 +14,7 @@ #include "galois/layers/DenseLayer.h" #include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/L2NormLayer.h" +#include "galois/layers/ReLULayer.h" #include "galois/layers/SAGELayer.h" #include "galois/layers/SigmoidLayer.h" #include "galois/layers/SoftmaxLayer.h" @@ -227,6 +228,11 @@ class GraphNeuralNetwork { i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); break; + case GNNLayerType::kReLU: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + break; case GNNLayerType::kDense: gnn_layers_.push_back(std::move(std::make_unique>( i, *graph_, &prev_output_layer, layer_dims, diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 9ac6b925ae..6929eb70a2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -26,7 +26,9 @@ enum class GNNLayerType { //! Dense linear xform layer kDense, //! L2 normalization layer - kL2Norm + kL2Norm, + //! ReLU layer + kReLU // TODO GAT }; @@ -647,7 +649,6 @@ class GNNLayer { void ActivationDerivative(PointerWithSize* gradient) { galois::StatTimer timer("BackwardActivation", "GNNLayer"); TimerStart(&timer); - #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.ActivationDerivativeGPU(gradient->data(), diff --git a/libgnn/include/galois/layers/ReLULayer.h b/libgnn/include/galois/layers/ReLULayer.h new file mode 100644 index 0000000000..879c462330 --- /dev/null +++ b/libgnn/include/galois/layers/ReLULayer.h @@ -0,0 +1,126 @@ +#pragma once +#include "galois/layers/GNNLayer.h" +#include "galois/GNNMath.h" + +// XXX(hc): We don't have GPU ReLU implementation. + +// TODO(hc): All intermediate layers in Galois-GNN have internal ReLU +// layer. So, this is not yet being used. 
+// BUT, I would like to leave this for the future. + +namespace galois { + +//! ReLU layer: takes each row of the input matrix and sets 0 to elements < 0 in a row. +//! Currently this only works with **single class* labels and is coded as such. +template +class ReLULayer : public GNNLayer { +public: + ReLULayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, + const GNNLayerDimensions& dimensions) + : ReLULayer( + layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig{.allocate_weights = false, .disable_output = true}) + {} + + ReLULayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config) : GNNLayer(layer_num, graph, + backward_output_matrix, dimensions, config) { + this->layer_type_ = galois::GNNLayerType::kReLU; + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + GALOIS_LOG_VERBOSE("ReLU initialized"); + } + + //! Perform max(0, input) to each row of input + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final { + return ForwardPhaseCPU(input_embeddings); + } + + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + galois::StatTimer Timer("ReLULayer", "ReLULayer"); + this->TimerStart(&Timer); + + // note: p_backward == input_embeddings + const size_t feature_length = this->layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if ((this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) && + !this->graph_.IsInSampledGraphSubgraph(row)) { + return; + } + } + + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + // TODO(hc): SHAD uses inplace update but Galois-GNN uses + // separate vector for outputs. + // Revisit this if there is performance differences. + this->forward_output_matrix_[row_index] = + std::max(float{0}, input_embeddings[row_index]); + } + } + }, + // TODO chunk size? + // steal on as some threads may have nothing to work on + // galois::steal(), galois::loopname("ReLUForward")); + galois::steal()); + this->TimerStop(&Timer); + return this->forward_output_matrix_; + } + + PointerWithSize BackwardPhaseCPU( + PointerWithSize prev_layer_input, + PointerWithSize* input_gradients) { + galois::StatTimer Timer("ReLUBackward", "ReLULayer"); + this->TimerStart(&Timer); + + const size_t feature_length = this->layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraphSubgraph(row)) + return; + } + // Even though ReLU is non-differentiable at 0, + // PyTorch's ReLU returns 0 for the derivative of 0. + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + this->p_backward_output_matrix_[row_index] = + (prev_layer_input[row_index] > 0? 
1 : 0) * + (*input_gradients)[row_index]; + } + } + }, + galois::steal(), galois::loopname("ReLUBackward")); + + this->TimerStop(&Timer); + + return this->p_backward_output_matrix_; + } + + //! Get gradients to fix distribution such that it leans more towards single + //! class ground truth. + PointerWithSize + BackwardPhase(PointerWithSize prev_layer_input, + PointerWithSize* input_gradients) final { + return BackwardPhaseCPU(prev_layer_input, input_gradients); + } +}; + +} // namespace galois diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 44b11cfa9b..c1da754222 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -63,6 +63,7 @@ llvm::cl::opt cl_layer_type( clEnumValN(galois::GNNLayerType::kSAGE, "sage", "SAGE layer (GCN with concat + mean)"), clEnumValN(galois::GNNLayerType::kL2Norm, "l2norm", "L2 norm layer"), + clEnumValN(galois::GNNLayerType::kReLU, "ReLU", "ReLU norm layer"), clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), cll::init(galois::GNNLayerType::kSAGE)); From 9c74645629cc456ff20232bcd17316f4e3888be4 Mon Sep 17 00:00:00 2001 From: patrickkenney9801 Date: Mon, 2 Oct 2023 15:58:35 -0500 Subject: [PATCH 608/660] feat: Add pre-commit to the repo and contributing guidelines --- .pre-commit-config.yaml | 25 +++++++++++++++++++++++++ .tool-versions | 1 + CONTRIBUTING.md | 31 +++++++++++++++++++++++++++++++ Makefile | 20 ++++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 .pre-commit-config.yaml create mode 100644 .tool-versions create mode 100644 CONTRIBUTING.md create mode 100644 Makefile diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..c30b4276e2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,25 @@ +--- +files: ^libcusp|^libdeepgalois|^libdist|^libgalois|^libgluon|^libgnn|^libwmd +exclude: ^scripts|^python|^inputs +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.2.0 + hooks: + - id: end-of-file-fixer + - id: mixed-line-ending + - id: trailing-whitespace + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.2.0 + hooks: + - id: forbid-tabs + exclude: ^scripts|^python + - id: remove-tabs + exclude: ^scripts|^python + args: [--whitespaces-count, '2'] + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + - id: clang-format + args: [-i] + # - id: clang-tidy + # args: [--fix, -p=build/compile_commands.json] diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000000..c00efa2d48 --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +pre-commit 2.19.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..2297468d67 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing + +Contributors must run quality checks on code. In place of CI we +recommend using `pre-commit` (described below) instead of running +tools like `clang-format` manually. + +Code should be clear and documented where needed. + +## Tools + +### [asdf](https://asdf-vm.com) + +Provides a declarative set of tools pinned to +specific versions for environmental consistency. + +These tools are defined in `.tool-versions`. +Run `make dependencies` to initialize a new environment. + +### [pre-commit](https://pre-commit.com) + +A left shifting tool to consistently run a set of checks on the code repo. +Our checks enforce syntax validations and formatting. +We encourage contributors to use pre-commit hooks. 
+ +```shell +# install all pre-commit hooks +make hooks + +# run pre-commit on repo once +make pre-commit +``` diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..2457b3c0a1 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +dependencies: dependencies-asdf + +dependencies-asdf: + @echo "Updating asdf plugins..." + @asdf plugin update --all >/dev/null 2>&1 || true + @echo "Adding new asdf plugins..." + @cut -d" " -f1 ./.tool-versions | xargs -I % asdf plugin-add % >/dev/null 2>&1 || true + @echo "Installing asdf tools..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf install {}' + @echo "Updating local environment to use proper tool versions..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf local {}' + @asdf reshim + @echo "Done!" + +hooks: + @pre-commit install --hook-type pre-commit + @pre-commit install-hooks + +pre-commit: + @pre-commit run -a From 57618b1ea1da4755ad5c1eb14a4f1904e939b48e Mon Sep 17 00:00:00 2001 From: patrickkenney9801 Date: Mon, 2 Oct 2023 16:04:54 -0500 Subject: [PATCH 609/660] chore: Run clang-format and pre-commit checks on repo --- libcusp/include/galois/graphs/NewGeneric.h | 30 +- libcusp/test/shad-dist-graph.cpp | 44 +- .../deepgalois/layers/GradientSyncStructs.h | 2 +- .../include/deepgalois/layers/aggregator.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 28 +- .../include/deepgalois/math_functions.hh | 10 +- libdeepgalois/include/deepgalois/optimizer.h | 8 +- libdeepgalois/include/deepgalois/random.h | 2 +- libdeepgalois/include/deepgalois/reader.h | 2 +- libdeepgalois/licensenote.txt | 6 +- libdeepgalois/scripts/run-multi.sh | 34 +- libdeepgalois/scripts/run-single.sh | 34 +- libdeepgalois/src/DistContext.cu | 8 +- libdeepgalois/src/Net.cu | 2 +- libdeepgalois/src/layers/gat_fw.h | 77 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 29 +- libdeepgalois/src/math_functions.cpp | 6 +- libdeepgalois/src/math_functions.cu | 4 +- libdeepgalois/src/utils.cpp | 6 +- libdist/CMakeLists.txt | 2 +- libdist/include/galois/DistGalois.h | 4 +- libdist/include/galois/runtime/Serialize.h | 7 +- libgalois/CMakeLists.txt | 2 +- libgalois/include/galois/Bag.h | 2 +- libgalois/include/galois/FixedSizeRing.h | 4 +- libgalois/include/galois/LargeArray.h | 6 +- libgalois/include/galois/ParallelSTL.h | 4 +- libgalois/include/galois/SharedMemSys.h | 4 +- libgalois/include/galois/Timer.h | 6 +- libgalois/include/galois/gdeque.h | 4 +- .../include/galois/graphs/BufferedGraph.h | 18 +- .../include/galois/graphs/LC_CSR_Graph.h | 6 +- .../include/galois/graphs/LC_CSR_Hypergraph.h | 10 +- .../galois/graphs/LC_InlineEdge_Graph.h | 6 +- .../include/galois/graphs/LC_Linear_Graph.h | 6 +- .../include/galois/graphs/LC_Morph_Graph.h | 6 +- libgalois/include/galois/graphs/MorphGraph.h | 8 +- .../include/galois/graphs/MorphHyperGraph.h | 8 +- .../galois/graphs/Morph_SepInOut_Graph.h | 8 +- libgalois/include/galois/graphs/OCGraph.h | 4 +- libgalois/include/galois/gslist.h | 2 +- .../include/galois/runtime/Executor_ForEach.h | 13 +- libgalois/include/galois/runtime/Mem.h | 4 +- libgalois/include/galois/runtime/Range.h | 4 +- libgalois/include/galois/runtime/SharedMem.h | 4 +- .../include/galois/runtime/ThreadTimer.h | 12 +- .../galois/substrate/PerThreadStorage.h | 4 +- .../include/galois/substrate/SharedMem.h | 4 +- .../include/galois/substrate/ThreadPool.h | 4 +- .../include/galois/worklists/AdaptiveObim.h | 2 +- libgalois/include/galois/worklists/Chunk.h | 4 +- libgalois/include/galois/worklists/WorkList.h | 2 +- 
libgalois/include/shad/DataTypes.h | 785 +++++++++--------- libgalois/include/shad/Graph.h | 212 ++--- libgalois/include/shad/GraphTypes.h | 2 +- libgalois/include/shad/ShadGraphConverter.h | 294 ++++--- libgalois/src/FileGraph.cpp | 2 +- libgalois/test/bandwidth.cpp | 2 +- libgalois/test/move.cpp | 10 +- libgalois/test/reduction.cpp | 4 +- .../include/galois/graphs/GluonSubstrate.h | 23 +- .../include/galois/runtime/SyncStructures.h | 2 +- libgnn/README.md | 18 +- .../galois/graphs/DegreeSyncStructures.h | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 193 +++-- libgnn/include/galois/layers/GNNLayer.h | 4 +- .../galois/layers/GraphConvolutionalLayer.h | 64 +- libgnn/include/galois/layers/ReLULayer.h | 28 +- libgnn/include/galois/layers/SAGELayer.h | 10 +- libgnn/src/GNNMath.cpp | 4 +- libgnn/src/layers/DenseLayer.cpp | 1 - libgnn/src/layers/GNNLayer.cpp | 1 - libgnn/test/CMakeLists.txt | 10 +- libgnn/test/back-conv-test.cpp | 4 +- libgnn/test/convlayer-test.cpp | 12 +- libgnn/test/gcn-sample-edge-test.cpp | 17 +- libgnn/test/gnnconstruct-test.cpp | 4 +- libgnn/test/gnngraph-test.cpp | 8 +- libgnn/test/gpu-back-conv-test.cpp | 4 +- libgnn/test/gpu-convlayer-test.cpp | 12 +- libgnn/test/gpu-sage-layer-test.cpp | 12 +- libgnn/test/l2norm-layer-test.cpp | 4 +- libgnn/test/mkl_micro.cpp | 17 +- libgnn/test/sage-layer-test.cpp | 12 +- libgnn/test/sample-test.cpp | 4 +- libgnn/test/single_mkl_micro.cpp | 120 +-- 87 files changed, 1212 insertions(+), 1209 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 9fa37159f1..e8d7e15d8e 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -397,8 +397,8 @@ class NewDistGraphGeneric : public DistGraph { *edgeEnd, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges); } else { - constructCSRFromSHADGraph( - &bufGraph, &shadConverter, nodeBegin, nodeEnd, host_prefix); + constructCSRFromSHADGraph(&bufGraph, &shadConverter, nodeBegin, nodeEnd, + host_prefix); } graphReadTimer.stop(); @@ -608,10 +608,11 @@ class NewDistGraphGeneric : public DistGraph { template < typename T = NodeTy, typename std::enable_if_t>* = nullptr> - void constructCSRFromSHADGraph( - galois::graphs::BufferedGraph* bufGraph, - shad::ShadGraphConverter* shadConverter, - uint64_t nodeBegin, uint64_t nodeEnd, std::string host_prefix) { + void + constructCSRFromSHADGraph(galois::graphs::BufferedGraph* bufGraph, + shad::ShadGraphConverter* shadConverter, + uint64_t nodeBegin, uint64_t nodeEnd, + std::string host_prefix) { uint32_t numLocalNodes = nodeEnd - nodeBegin; // So, this holds outgoing edge array of a whole (global) graph. uint64_t* outIndexBuffer = shadConverter->getOutIndexBuffer(); @@ -625,14 +626,13 @@ class NewDistGraphGeneric : public DistGraph { // From now on, those arrays store local node information // as a dense memory representation. 
shadConverter->extractLocalOutIndexArray(nodeBegin, nodeEnd); - galois::gInfo(host_prefix, - "Completes local out index array construction"); + galois::gInfo(host_prefix, "Completes local out index array construction"); galois::gInfo(host_prefix, "Starts edge destination/data " "array construction"); uint64_t numLocalEdges = edgeEnd - edgeBegin; shadConverter->constructEdgeArrays(nodeBegin, edgeBegin, numLocalNodes, - numLocalEdges); + numLocalEdges); galois::gInfo(host_prefix, "Completes edge destination/data " "array construction"); @@ -646,12 +646,12 @@ class NewDistGraphGeneric : public DistGraph { } // Disable this method for non-SHAD graph construction. - template < - typename T = NodeTy, - typename std::enable_if_t>* = nullptr> - void constructCSRFromSHADGraph( - galois::graphs::BufferedGraph*, - shad::ShadGraphConverter*, uint64_t, uint64_t, std::string) {} + template >* = + nullptr> + void constructCSRFromSHADGraph(galois::graphs::BufferedGraph*, + shad::ShadGraphConverter*, uint64_t, uint64_t, + std::string) {} /** * @brief Assign a SHAD node type to a node data. diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp index dedc3c34cb..492bfeb2ad 100644 --- a/libcusp/test/shad-dist-graph.cpp +++ b/libcusp/test/shad-dist-graph.cpp @@ -21,12 +21,12 @@ #include "galois/Galois.h" #include "galois/graphs/CuSPPartitioner.h" -#include "shad/ShadGraphConverter.h" +#include "shad/ShadGraphConverter.h" int main() { galois::DistMemSys G; unsigned M = galois::substrate::getThreadPool().getMaxThreads(); - //M = 1; + // M = 1; galois::setActiveThreads(M); shad::ShadGraphConverter shadConverter; @@ -38,7 +38,8 @@ int main() { std::string filename = "/home/hochan/data.01.csv"; shadConverter.readSHADFile(filename, &numNodes, &numEdges); std::unique_ptr> - graph = galois::cuspPartitionGraph( + graph = galois::cuspPartitionGraph( filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); std::cout << "Test starts...\n"; @@ -62,31 +63,34 @@ int main() { std::cout << "Num. 
nodes/edges tests has been passed\n"; - uint32_t id = galois::runtime::getSystemNetworkInterface().ID; + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; { - std::ofstream fp(std::to_string(id) + ".master"); - for (uint32_t src = 0; src < graph->numMasters(); ++src) { - uint64_t srcglobal = graph->getGID(src); - fp << "node " << srcglobal << ", type: " << graph->getData(src).type << - ", key: " << graph->getData(src).key << "\n"; - for (auto e : graph->edges(src)) { - uint32_t dstlocal = graph->getEdgeDst(e); - uint64_t dstglobal = graph->getGID(dstlocal); - fp << "\t edge dst " << dstglobal << ", type: " << - graph->getEdgeData(e) << "\n"; + std::ofstream fp(std::to_string(id) + ".master"); + for (uint32_t src = 0; src < graph->numMasters(); ++src) { + uint64_t srcglobal = graph->getGID(src); + fp << "node " << srcglobal << ", type: " << graph->getData(src).type + << ", key: " << graph->getData(src).key << "\n"; + for (auto e : graph->edges(src)) { + uint32_t dstlocal = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dstlocal); + fp << "\t edge dst " << dstglobal << ", type: " << graph->getEdgeData(e) + << "\n"; + } } - } - fp.close(); + fp.close(); } { for (uint32_t host = 0; host < numHosts; ++host) { - if (host == id) { continue; } - std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".graph"); + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + + ".graph"); for (uint32_t i = 0; i < graph->size(); ++i) { - fp << i << ", " << graph->getGID(i) << ", " << - graph->getData(i).type << ", " << graph->getData(i).key << "\n"; + fp << i << ", " << graph->getGID(i) << ", " << graph->getData(i).type + << ", " << graph->getData(i).key << "\n"; } fp.close(); } diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 6f600b40a8..d4c23af1bb 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -20,7 +20,7 @@ struct GradientSync { } weight += y; // need a post process divide all step - //weight /= 2; + // weight /= 2; return true; } diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 3f2d3c7f1b..8ef845b1d9 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -13,7 +13,7 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, } // namespace deepgalois #else #include "deepgalois/GraphTypes.h" -//#include "graph_gpu.h" +// #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index d112ddf785..14c47c9813 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -74,7 +74,7 @@ class graph_conv_layer : public layer { float_t* in_temp1; float_t* trans_data; // y*x mask_t* dropout_mask; // x*y - float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 + float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 // Glorot & Bengio (AISTATS 2010) inline void 
rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 534d99b821..6e1ac879cc 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -58,23 +58,23 @@ class layer : public deepgalois::node { bool use_mask; vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex v's neighbors, same size as W - vec_t weight_grad; // weight gradient for updating parameters - float_t* d_W; // parameters to learn on device (GPU) + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; // parameters to learn on device (GPU) float_t* d_weight_grad; // weight gradient on device (GPU) - vec_t alpha_l; // parameters to learn (H x 1), only used for GAT - vec_t alpha_r; // parameters to learn (H x 1), only used for GAT - vec_t alpha_lgrad; // gradients for updating alpha (GAT only) - vec_t alpha_rgrad; // gradients for updating alpha (GAT only) - mask_t* masks_; // masks to show which samples are valid - mask_t* d_masks_; // masks on device (GPU) - float_t* loss; // error for each vertex: N x 1 + vec_t alpha_l; // parameters to learn (H x 1), only used for GAT + vec_t alpha_r; // parameters to learn (H x 1), only used for GAT + vec_t alpha_lgrad; // gradients for updating alpha (GAT only) + vec_t alpha_rgrad; // gradients for updating alpha (GAT only) + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; // masks on device (GPU) + float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; - float_t* norm_consts; // normalization score - vec_t scores; // un-normalized scores - vec_t temp_scores; // un-normalized scores - vec_t scores_grad; // gradients of un-normalized scores - vec_t norm_scores; // normalized scores + float_t* norm_consts; // normalization score + vec_t scores; // un-normalized scores + vec_t temp_scores; // un-normalized scores + vec_t scores_grad; // gradients of un-normalized scores + vec_t norm_scores; // normalized scores vec_t norm_scores_grad; // gradients of normalized scores // TODO #ifdef GALOIS_ENABLE_GPU diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 38f461620a..e6b5836386 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -81,8 +81,8 @@ void relu_cpu(size_t n, const float_t* in, float_t* out); void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); // Leaky ReLU -void leaky_relu(float_t epsilon, float_t in, float_t &out); -void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out); +void leaky_relu(float_t epsilon, float_t in, float_t& out); +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out); void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); @@ -171,8 +171,10 @@ void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); void uint8_malloc_device(int n, uint8_t*& ptr); void uint8_free_device(uint8_t*& ptr); void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* 
masks, + float_t* loss); acc_t l2_norm_gpu(int n, const float_t* in); void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); -void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, float_t* out_diff); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff); #endif diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index f5eb4b54ec..694819591c 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -23,10 +23,10 @@ namespace deepgalois { // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) struct optimizer { - optimizer() = default; - optimizer(const optimizer&) = default; - optimizer(optimizer&&) = default; - optimizer& operator=(const optimizer&) = default; + optimizer() = default; + optimizer(const optimizer&) = default; + optimizer(optimizer&&) = default; + optimizer& operator=(const optimizer&) = default; optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; diff --git a/libdeepgalois/include/deepgalois/random.h b/libdeepgalois/include/deepgalois/random.h index bf1648bc2a..6e5cb0fe5b 100644 --- a/libdeepgalois/include/deepgalois/random.h +++ b/libdeepgalois/include/deepgalois/random.h @@ -50,4 +50,4 @@ uniform_rand(T min, T max) { std::uniform_real_distribution dst(min, max); return dst(random_generator::get_instance()()); } -} //end of namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 5e034ec210..c25eeceac2 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,6 +1,6 @@ #pragma once #include "deepgalois/lgraph.h" -//#include "galois/DistGalois.h" +// #include "galois/DistGalois.h" namespace deepgalois { class Reader { diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt index cf1aeb6caf..d9bf751eac 100644 --- a/libdeepgalois/licensenote.txt +++ b/libdeepgalois/licensenote.txt @@ -33,13 +33,13 @@ committed. LICENSE Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: +modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. + list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. + and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED diff --git a/libdeepgalois/scripts/run-multi.sh b/libdeepgalois/scripts/run-multi.sh index 660fac74b3..da9861fb2e 100755 --- a/libdeepgalois/scripts/run-multi.sh +++ b/libdeepgalois/scripts/run-multi.sh @@ -13,21 +13,21 @@ HIDDENDIM="16 64 128" OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois for GNN in $GNNS; do - for NT in $NTHREADS; do - for GR in $GRAPHS; do - for K in $EPOCHS; do - for DR in $DROPOUT; do - for LR in $LEARNINGRATES; do - for HD in $HIDDENDIM; do - EXEC_DIR=$LONESTARGNN/$GNN - echo $EXEC_DIR - echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" - $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log - echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" - done - done - done - done - done - done + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done done diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh index 37a393d788..a6bc223ebd 100755 --- a/libdeepgalois/scripts/run-single.sh +++ b/libdeepgalois/scripts/run-single.sh @@ -13,21 +13,21 @@ HIDDENDIM="16 32 64 128 256 512" OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois for GNN in $GNNS; do - for NT in $NTHREADS; do - for GR in $GRAPHS; do - for K in $EPOCHS; do - for DR in $DROPOUT; do - for LR in $LEARNINGRATES; do - for HD in $HIDDENDIM; do - EXEC_DIR=$LONESTARGNN/$GNN - echo $EXEC_DIR - echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" - $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log - echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" - done - done - done - done - done - done + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. 
Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done done diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index b67f0f9125..30704b0748 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -64,9 +64,9 @@ cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; curandGenerator_t DistContext::curand_generator_ = 0; DistContext::DistContext() : DistContext(true) { - d_labels = NULL; + d_labels = NULL; d_feats = NULL; - d_labels_subg = NULL; + d_labels_subg = NULL; d_feats_subg = NULL; d_normFactors = NULL; d_normFactorsSub = NULL; @@ -110,7 +110,7 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } -size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { return reader.read_masks(mask_type, n, begin, end, masks); } @@ -132,7 +132,7 @@ void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { for (size_t i = 0; i < this->partitionedGraph->size(); i++) { if (masks[i] == 1) { if (usingSingleClass) h_labels_subg[count] = h_labels[i]; - else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, &h_labels_subg[count * num_classes]); count++; } diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 2921b81996..ee70e1d578 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -191,7 +191,7 @@ void Net::read_test_masks(std::string dataset) { for (size_t i = globalTestBegin; i < globalTestEnd; i++) globalTestMasks[i] = 1; } else { - globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, globalTestMasks, NULL); } //copy_test_masks_to_device(); diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h index e9a7bada37..d57f485a8c 100644 --- a/libdeepgalois/src/layers/gat_fw.h +++ b/libdeepgalois/src/layers/gat_fw.h @@ -1,6 +1,6 @@ -//#define USE_GAT +// #define USE_GAT #ifdef USE_GAT -// `Graph Attention Network ` +// `Graph Attention Network ` // NOTE: GAT paper uses "first concatenation then linear projection" // to compute attention scores, while ours is "first projection then // addition", the two approaches are mathematically equivalent: @@ -10,7 +10,7 @@ // save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, // addition could be optimized with DGL's built-in function u_add_v, // which further speeds up computation and saves memory footprint. 
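// A brief sketch of that equivalence (assuming alpha is the concatenation
// [alpha_l || alpha_r], with alpha_l and alpha_r each of length `len`):
//   e_ij = alpha^T [W*h_i || W*h_j] = alpha_l^T (W*h_i) + alpha_r^T (W*h_j)
// so each per-edge score decomposes into two per-vertex dot products
// (`src_score` and `dst_score` in the code below), and the concatenated
// vector [W*h_i || W*h_j] never has to be materialized on every edge.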
- + void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { size_t n = g.size(); @@ -19,34 +19,34 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, auto deg_src = g.get_degree(src); // concatenation, dot product, LeakyReLU - //int i = 0; - //vec_t scores(deg_src); + // int i = 0; + // vec_t scores(deg_src); auto begin = g.edge_begin(src); - auto end = g.edge_end(src); + auto end = g.edge_end(src); // alpha: learnable weight vector (shared by all vertices) float_t src_score = math::dot(len, &alpha_l[0], &in[src_idx]); for (auto e = begin; e != end; e++) { - auto dst = g.getEdgeDst(e); + auto dst = g.getEdgeDst(e); auto dst_idx = dst * len; - //vec_t concat_vec(2*len); - //math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); - //float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); + // vec_t concat_vec(2*len); + // math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); + // float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); float_t dst_score = math::dot(len, &alpha_r[0], &in[dst_idx]); - temp_scores[e] = src_score + dst_score; + temp_scores[e] = src_score + dst_score; math::leaky_relu(epsilon, temp_scores[e], scores[e]); } // softmax to normalize the attention scores on each vertexโ€™s incoming edges - //vec_t normalized_scores(deg_src, 0); - //math::softmax(deg_src, &scores[0], &normalized_scores[0]); + // vec_t normalized_scores(deg_src, 0); + // math::softmax(deg_src, &scores[0], &normalized_scores[0]); math::softmax(deg_src, &scores[begin], &norm_scores[begin]); // aggregation: scaled by the attention scores math::clear_cpu(len, &out[src_idx]); for (auto e = begin; e != end; e++) { - auto dst = g.getEdgeDst(e); + auto dst = g.getEdgeDst(e); auto dst_idx = dst * len; - auto score = norm_scores[e]; + auto score = norm_scores[e]; vec_t neighbor(len); math::scale(len, score, &in[dst_idx], &neighbor[0]); math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); @@ -55,47 +55,48 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, } void graph_conv_layer::d_compute_scores(size_t len, Graph& g, - const float_t* in_data, - const float_t *out_data, + const float_t* in_data, + const float_t* out_data, const float_t* in_grad) { size_t n = g.size(); // compute gradients for the learnable vector `alpha` - //vec_t temp_grad(n*n); - //math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, + // vec_t temp_grad(n*n); + // math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, // in_grad, 0.0, temp_grad); galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - auto begin = g.edge_begin(src); - auto end = g.edge_end(src); + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); auto deg_src = g.get_degree(src); - math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], + math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], &scores_grad[begin], &norm_scores_grad[begin]); for (auto e = begin; e != end; e++) { auto dst = g.getEdgeDst(e); - // use norm_scores_grad as temp_scores_grad since its data is useless already - math::d_leaky_relu(epsilon, &scores_grad[e], - &temp_scores[e], &norm_scores_grad[e]); + // use norm_scores_grad as temp_scores_grad since its data is useless + // already + math::d_leaky_relu(epsilon, &scores_grad[e], &temp_scores[e], + &norm_scores_grad[e]); math::scale(len, norm_scores_grad[e], &in_data[src_idx], &alpha_lgrad[0]); math::scale(len, norm_scores_grad[e], &in_data[dst_idx], &alpha_rgrad[0]); } 
}); } -void graph_conv_layer::d_aggregate(size_t len, Graph& g, - const float_t* in_grad, float_t* out_grad) { +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in_grad, + float_t* out_grad) { size_t n = g.size(); // aggregation: the derivative is transposed; - // the graph is undirected (structurally symmetric), + // the graph is undirected (structurally symmetric), // but values are not the same for the symmetric positions galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - auto src_idx = src * len; + auto src_idx = src * len; auto src_begin = g.edge_begin(src); for (auto e = src_begin; e != g.edge_end(src); e++) { - auto dst = g.getEdgeDst(e); - auto dst_idx = dst * len; + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; auto dst_begin = g.edge_begin(dst); - auto score = norm_scores[dst_begin+e-src_begin]; // transposed + auto score = norm_scores[dst_begin + e - src_begin]; // transposed vec_t neighbor(len); math::scale(len, score, &in_grad[dst_idx], &neighbor[0]); math::vadd_cpu(len, &out_grad[src_idx], &neighbor[0], &out_grad[src_idx]); @@ -113,8 +114,8 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // dropout if (dropout_ && phase_ == net_phase::train) { - math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, - dropout_mask, in_temp); + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); } else { math::copy_cpu(x * y, in_data, in_temp); } @@ -125,9 +126,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // aggregation aggregate(z, *graph_cpu, out_temp, out_data); - + // ReLU - if (act_) math::relu_cpu(x * z, out_data, out_data); + if (act_) + math::relu_cpu(x * z, out_data, out_data); } void graph_conv_layer::back_propagation(const float_t* in_data, @@ -136,7 +138,8 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - if (act_) math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); // compute gradients for alpha (alpha is a learnable vector) d_compute_scores(z, *graph_cpu, in_temp, out_temp, out_grad); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index da9b01dbae..f13b26be25 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -56,7 +56,7 @@ void graph_conv_layer::malloc_and_init() { // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); - //rand_init_matrix(y, z, Q, 1); // for GraphSAGE + // rand_init_matrix(y, z, Q, 1); // for GraphSAGE zero_init_matrix(y, z, layer::weight_grad); @@ -64,12 +64,12 @@ void graph_conv_layer::malloc_and_init() { // alpha is only used for GAT rand_init_matrix(z, 1, alpha_l, 1); rand_init_matrix(z, 1, alpha_r, 1); - alpha_lgrad.resize(2*z); - alpha_rgrad.resize(2*z); + alpha_lgrad.resize(2 * z); + alpha_rgrad.resize(2 * z); std::fill(alpha_lgrad.begin(), alpha_lgrad.end(), 0); std::fill(alpha_rgrad.begin(), alpha_rgrad.end(), 0); auto ne = graph_cpu->sizeEdges(); // number of edges - scores.resize(ne); // a score for each edge + scores.resize(ne); // a score for each edge temp_scores.resize(ne); scores_grad.resize(ne); norm_scores.resize(ne); @@ -77,7 +77,7 @@ void graph_conv_layer::malloc_and_init() { epsilon = 0.2; // LeakyReLU angle of negative slope #endif dropout_ = true; - act_ = false; + act_ = false; if 
(dropout_) dropout_mask = new mask_t[x * y]; @@ -233,7 +233,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions - if (level_ != 0) {// no need to calculate in_grad for the first layer + if (level_ != 0) { // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y @@ -254,7 +254,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, compute_timer.stop(); // sync agg - //galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); + // galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); if (level_ != 0) { deepgalois::_syncVectorSize = y; deepgalois::_dataToSync = in_grad; @@ -275,14 +275,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, unsigned host_num = galois::runtime::getSystemNetworkInterface().Num; layer::syncSub->sync("Gradients"); galois::do_all( - galois::iterate((size_t)0, (size_t)z), - [&] (size_t i) { - //galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); - layer::weight_grad[i] /= host_num; - //galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); - }, - galois::loopname("sync post process") - ); + galois::iterate((size_t)0, (size_t)z), + [&](size_t i) { + // galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); + layer::weight_grad[i] /= host_num; + // galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); + }, + galois::loopname("sync post process")); galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index aed0ac79b9..b8addfe887 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -178,7 +178,7 @@ float_t dot(size_t n, const float_t* x, const float_t* y) { // concatenation of two vectors into one void concat(size_t n, const float_t* x, const float_t* y, float_t* z) { copy_cpu(n, x, z); - copy_cpu(n, y, z+n); + copy_cpu(n, y, z + n); } void clear_cpu(size_t n, float_t* in) { @@ -244,11 +244,11 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, galois::chunk_size<64>(), galois::loopname("d_relu")); } -void leaky_relu(float_t epsilon, float_t in, float_t &out) { +void leaky_relu(float_t epsilon, float_t in, float_t& out) { out = in > 0.0 ? in : epsilon * in; } -void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out) { +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out) { out = in * (data > 0.0 ? 
1.0 : epsilon); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9a7c4bc1dd..b9f7686867 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -234,9 +234,9 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const int* A_nnz_idx, const float* B, const float beta, float* transpose_C, float* C) { //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; - CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), + M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); // transpose C const float one = 1.0; diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 61ff3a2e58..1b237ff7c3 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -109,9 +109,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, double precision_mic = tp_accum + fp_accum > 0 ? (double)tp_accum / (double)(tp_accum + fp_accum) : 0.; - double recall_mic = tp_accum + fn_accum > 0 - ? (double)tp_accum / (double)(tp_accum + fn_accum) - : 0.; + double recall_mic = tp_accum + fn_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fn_accum) + : 0.; double f1_micro = recall_mic + precision_mic > 0. ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) diff --git a/libdist/CMakeLists.txt b/libdist/CMakeLists.txt index 57e6aa1750..2930d37cbf 100644 --- a/libdist/CMakeLists.txt +++ b/libdist/CMakeLists.txt @@ -49,7 +49,7 @@ if (GALOIS_USE_LCI) add_dependencies(galois_dist_async lci) target_link_libraries(galois_dist_async PRIVATE ${LCI_LIBRARY} -lpsm2) - target_include_directories(galois_dist_async PUBLIC + target_include_directories(galois_dist_async PUBLIC $ $ ) diff --git a/libdist/include/galois/DistGalois.h b/libdist/include/galois/DistGalois.h index b87c539f3e..e39f311470 100644 --- a/libdist/include/galois/DistGalois.h +++ b/libdist/include/galois/DistGalois.h @@ -44,10 +44,10 @@ class DistMemSys : public runtime::SharedMem { ~DistMemSys(); - DistMemSys(const DistMemSys&) = delete; + DistMemSys(const DistMemSys&) = delete; DistMemSys& operator=(const DistMemSys&) = delete; - DistMemSys(DistMemSys&&) = delete; + DistMemSys(DistMemSys&&) = delete; DistMemSys& operator=(DistMemSys&&) = delete; }; diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index bfd25c3cf3..a7b83174b7 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -1055,9 +1055,10 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) -> decltype( - std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) + -> decltype(std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libgalois/CMakeLists.txt b/libgalois/CMakeLists.txt index 76161160f6..4721bc0261 100644 --- a/libgalois/CMakeLists.txt +++ b/libgalois/CMakeLists.txt @@ -10,7 +10,7 @@ set(sources "${CMAKE_CURRENT_BINARY_DIR}/Version.cpp" 
src/Barrier_Counting.cpp src/Barrier.cpp - src/Barrier_Dissemination.cpp + src/Barrier_Dissemination.cpp src/Barrier_MCS.cpp src/Barrier_Pthread.cpp src/Barrier_Simple.cpp diff --git a/libgalois/include/galois/Bag.h b/libgalois/include/galois/Bag.h index 6592bec529..985fdffcb7 100644 --- a/libgalois/include/galois/Bag.h +++ b/libgalois/include/galois/Bag.h @@ -212,7 +212,7 @@ class InsertBag { return *this; } - InsertBag(const InsertBag&) = delete; + InsertBag(const InsertBag&) = delete; InsertBag& operator=(const InsertBag&) = delete; ~InsertBag() { destruct_parallel(); } diff --git a/libgalois/include/galois/FixedSizeRing.h b/libgalois/include/galois/FixedSizeRing.h index 51e1466011..e1d7896781 100644 --- a/libgalois/include/galois/FixedSizeRing.h +++ b/libgalois/include/galois/FixedSizeRing.h @@ -67,7 +67,7 @@ class FixedSizeBagBase { } } - FixedSizeBagBase(const FixedSizeBagBase& o) = delete; + FixedSizeBagBase(const FixedSizeBagBase& o) = delete; FixedSizeBagBase& operator=(const FixedSizeBagBase& o) = delete; ~FixedSizeBagBase() { clear(); } @@ -284,7 +284,7 @@ class FixedSizeRing { } } - FixedSizeRing(const FixedSizeRing& o) = delete; + FixedSizeRing(const FixedSizeRing& o) = delete; FixedSizeRing& operator=(const FixedSizeRing& o) = delete; ~FixedSizeRing() { clear(); } diff --git a/libgalois/include/galois/LargeArray.h b/libgalois/include/galois/LargeArray.h index 71df3036ff..fe2e99c364 100644 --- a/libgalois/include/galois/LargeArray.h +++ b/libgalois/include/galois/LargeArray.h @@ -172,7 +172,7 @@ class LargeArray { return *this; } - LargeArray(const LargeArray&) = delete; + LargeArray(const LargeArray&) = delete; LargeArray& operator=(const LargeArray&) = delete; ~LargeArray() { @@ -305,8 +305,8 @@ class LargeArray { public: LargeArray(void*, size_t) {} - LargeArray() = default; - LargeArray(const LargeArray&) = delete; + LargeArray() = default; + LargeArray(const LargeArray&) = delete; LargeArray& operator=(const LargeArray&) = delete; friend void swap(LargeArray&, LargeArray&) {} diff --git a/libgalois/include/galois/ParallelSTL.h b/libgalois/include/galois/ParallelSTL.h index 4158a6dc5c..f5878686a9 100644 --- a/libgalois/include/galois/ParallelSTL.h +++ b/libgalois/include/galois/ParallelSTL.h @@ -119,7 +119,7 @@ struct sort_helper { RandomAccessIterator pivot = choose_rand(bounds.first, bounds.second); VT pv = *pivot; pivot = std::partition(bounds.first, bounds.second, - std::bind(comp, std::placeholders::_1, pv)); + std::bind(comp, std::placeholders::_1, pv)); // push the lower bit if (bounds.first != pivot) ctx.push(std::make_pair(bounds.first, pivot)); @@ -209,7 +209,7 @@ struct partition_helper { RP high, low; do { RP parts = dual_partition(low.first, low.second, high.first, high.second, - state->pred); + state->pred); low.first = parts.first; high.second = parts.second; if (low.first == low.second) diff --git a/libgalois/include/galois/SharedMemSys.h b/libgalois/include/galois/SharedMemSys.h index 8177a2283a..52459032d1 100644 --- a/libgalois/include/galois/SharedMemSys.h +++ b/libgalois/include/galois/SharedMemSys.h @@ -16,10 +16,10 @@ class SharedMemSys : public runtime::SharedMem { explicit SharedMemSys(); ~SharedMemSys(); - SharedMemSys(const SharedMemSys&) = delete; + SharedMemSys(const SharedMemSys&) = delete; SharedMemSys& operator=(const SharedMemSys&) = delete; - SharedMemSys(SharedMemSys&&) = delete; + SharedMemSys(SharedMemSys&&) = delete; SharedMemSys& operator=(SharedMemSys&&) = delete; }; diff --git a/libgalois/include/galois/Timer.h 
b/libgalois/include/galois/Timer.h index f12c41c6b0..51ab492ff4 100644 --- a/libgalois/include/galois/Timer.h +++ b/libgalois/include/galois/Timer.h @@ -72,10 +72,10 @@ class StatTimer : public TimeAccumulator { StatTimer() : StatTimer(nullptr, nullptr) {} - StatTimer(const StatTimer&) = delete; - StatTimer(StatTimer&&) = delete; + StatTimer(const StatTimer&) = delete; + StatTimer(StatTimer&&) = delete; StatTimer& operator=(const StatTimer&) = delete; - StatTimer& operator=(StatTimer&&) = delete; + StatTimer& operator=(StatTimer&&) = delete; ~StatTimer(); diff --git a/libgalois/include/galois/gdeque.h b/libgalois/include/galois/gdeque.h index 737f989107..19830c0309 100644 --- a/libgalois/include/galois/gdeque.h +++ b/libgalois/include/galois/gdeque.h @@ -35,7 +35,7 @@ namespace galois { // Experimental random access iterator. Slower than old iterator for simple // traversals, so disable for now -//#define _NEW_ITERATOR +// #define _NEW_ITERATOR //! Like std::deque but use Galois memory management functionality template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(&nodeData[N], mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -216,7 +216,7 @@ class LC_CSR_Graph : template ::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator nn, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edgeData.set(*nn, {}); } diff --git a/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h b/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h index 7c76391a46..d2ba3aad6f 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h @@ -190,13 +190,13 @@ class LC_CSR_Hypergraph : template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(&nodeData[N], mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -217,7 +217,7 @@ class LC_CSR_Hypergraph : template ::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator nn, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edgeData.set(*nn, {}); } @@ -269,8 +269,8 @@ class LC_CSR_Hypergraph : BOOST_SERIALIZATION_SPLIT_MEMBER() public: - LC_CSR_Hypergraph(LC_CSR_Hypergraph&& rhs) = default; - LC_CSR_Hypergraph() = default; + LC_CSR_Hypergraph(LC_CSR_Hypergraph&& rhs) = default; + LC_CSR_Hypergraph() = default; LC_CSR_Hypergraph& operator=(LC_CSR_Hypergraph&&) = default; /** diff --git a/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h b/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h index c0d8021167..f3db63a7fe 100644 --- a/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h +++ b/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h @@ -186,13 +186,13 @@ class LC_InlineEdge_Graph template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(N, mflag); } template void acquireNode(GraphNode N, MethodFlag 
mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -220,7 +220,7 @@ class LC_InlineEdge_Graph bool _A2 = LargeArray::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator, EdgeInfo* edge, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edge->construct(); } diff --git a/libgalois/include/galois/graphs/LC_Linear_Graph.h b/libgalois/include/galois/graphs/LC_Linear_Graph.h index a884bfc91b..f92a0a77de 100644 --- a/libgalois/include/galois/graphs/LC_Linear_Graph.h +++ b/libgalois/include/galois/graphs/LC_Linear_Graph.h @@ -163,13 +163,13 @@ class LC_Linear_Graph template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(N, mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -195,7 +195,7 @@ class LC_Linear_Graph bool _A2 = LargeArray::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator, EdgeInfo* edge, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edge->construct(); } diff --git a/libgalois/include/galois/graphs/LC_Morph_Graph.h b/libgalois/include/galois/graphs/LC_Morph_Graph.h index 78cf28b9ae..fdc02c468e 100644 --- a/libgalois/include/galois/graphs/LC_Morph_Graph.h +++ b/libgalois/include/galois/graphs/LC_Morph_Graph.h @@ -240,7 +240,7 @@ class LC_Morph_Graph */ template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(N, mflag); } @@ -254,7 +254,7 @@ class LC_Morph_Graph */ template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -288,7 +288,7 @@ class LC_Morph_Graph bool _A2 = LargeArray::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/MorphGraph.h b/libgalois/include/galois/graphs/MorphGraph.h index a52d9dd676..7a7b89bef6 100644 --- a/libgalois/include/galois/graphs/MorphGraph.h +++ b/libgalois/include/galois/graphs/MorphGraph.h @@ -609,9 +609,9 @@ class MorphGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdgeWithReuse(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdgeWithReuse(dst, e, false, - std::forward(args)...); + std::forward(args)...); } } return boost::make_filter_iterator(is_out_edge(), ii, src->end()); @@ -633,7 +633,7 @@ class MorphGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdge(src, e, Directional ? 
true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdge(dst, e, false, std::forward(args)...); } } @@ -702,7 +702,7 @@ class MorphGraph : private boost::noncopyable { EdgeTy* constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/MorphHyperGraph.h b/libgalois/include/galois/graphs/MorphHyperGraph.h index 1dae113408..f408d9fa9b 100644 --- a/libgalois/include/galois/graphs/MorphHyperGraph.h +++ b/libgalois/include/galois/graphs/MorphHyperGraph.h @@ -620,9 +620,9 @@ class MorphHyperGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdgeWithReuse(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdgeWithReuse(dst, e, false, - std::forward(args)...); + std::forward(args)...); } } return boost::make_filter_iterator(is_out_edge(), ii, src->end()); @@ -644,7 +644,7 @@ class MorphHyperGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdge(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdge(dst, e, false, std::forward(args)...); } } @@ -713,7 +713,7 @@ class MorphHyperGraph : private boost::noncopyable { EdgeTy* constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h b/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h index db19218240..86b811a914 100644 --- a/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h +++ b/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h @@ -542,9 +542,9 @@ class Morph_SepInOut_Graph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdgeWithReuse(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdgeWithReuse(dst, e, false, - std::forward(args)...); + std::forward(args)...); } } return boost::make_filter_iterator(is_out_edge(), ii, src->end()); @@ -565,7 +565,7 @@ class Morph_SepInOut_Graph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdge(src, e, Directional ? 
true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdge(dst, e, false, std::forward(args)...); } } @@ -634,7 +634,7 @@ class Morph_SepInOut_Graph : private boost::noncopyable { EdgeTy* constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/OCGraph.h b/libgalois/include/galois/graphs/OCGraph.h index 02cb9afd9e..5e1c2d7c26 100644 --- a/libgalois/include/galois/graphs/OCGraph.h +++ b/libgalois/include/galois/graphs/OCGraph.h @@ -394,13 +394,13 @@ class OCImmutableEdgeGraph template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(&nodeData[N], mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(idFromNode(N), mflag); } diff --git a/libgalois/include/galois/gslist.h b/libgalois/include/galois/gslist.h index dd7fcc8c70..ca8a372515 100644 --- a/libgalois/include/galois/gslist.h +++ b/libgalois/include/galois/gslist.h @@ -174,7 +174,7 @@ class gslist_base { gslist_base() : first(0) {} - gslist_base(const gslist_base&) = delete; + gslist_base(const gslist_base&) = delete; gslist_base& operator=(const gslist_base&) = delete; gslist_base(gslist_base&& other) : first(0) { *this = std::move(other); } diff --git a/libgalois/include/galois/runtime/Executor_ForEach.h b/libgalois/include/galois/runtime/Executor_ForEach.h index 5b40cb6e48..ff17133009 100644 --- a/libgalois/include/galois/runtime/Executor_ForEach.h +++ b/libgalois/include/galois/runtime/Executor_ForEach.h @@ -383,9 +383,10 @@ class ForEachExecutor { public: ForEachExecutor(FunctionTy f, const ArgsTy& args) - : ForEachExecutor(T1{}, f, args, get_trait_value(args).args, - std::make_index_sequence(args).args)>::value>{}) {} + : ForEachExecutor( + T1{}, f, args, get_trait_value(args).args, + std::make_index_sequence(args).args)>::value>{}) {} template void init(const RangeTy&) {} @@ -416,8 +417,10 @@ class ForEachExecutor { }; template -constexpr auto has_with_iterator(int) -> decltype( - std::declval::type>(), bool()) { +constexpr auto has_with_iterator(int) + -> decltype(std::declval< + typename WLTy::template with_iterator::type>(), + bool()) { return true; } diff --git a/libgalois/include/galois/runtime/Mem.h b/libgalois/include/galois/runtime/Mem.h index 994f27ac25..3d2a353f3c 100644 --- a/libgalois/include/galois/runtime/Mem.h +++ b/libgalois/include/galois/runtime/Mem.h @@ -554,8 +554,8 @@ class StaticSingleInstance : private boost::noncopyable { // std::unique_ptr(); template -substrate::PtrLock - StaticSingleInstance::ptr = substrate::PtrLock(); +substrate::PtrLock StaticSingleInstance::ptr = + substrate::PtrLock(); class PageHeap : public StaticSingleInstance { diff --git a/libgalois/include/galois/runtime/Range.h b/libgalois/include/galois/runtime/Range.h index 01632dcd3a..135686e83f 100644 --- a/libgalois/include/galois/runtime/Range.h +++ b/libgalois/include/galois/runtime/Range.h @@ -303,8 +303,8 @@ template class HasLocalIter { template - using CallExprType = typename std::remove_reference().local_begin())>::type; + using CallExprType = typename std::remove_reference< + 
decltype(std::declval().local_begin())>::type; template static std::true_type go(typename std::add_pointer>::type); diff --git a/libgalois/include/galois/runtime/SharedMem.h b/libgalois/include/galois/runtime/SharedMem.h index 34d847d6ed..7389502bd9 100644 --- a/libgalois/include/galois/runtime/SharedMem.h +++ b/libgalois/include/galois/runtime/SharedMem.h @@ -46,10 +46,10 @@ class SharedMem : public galois::substrate::SharedMem { internal::setPagePoolState(nullptr); } - SharedMem(const SharedMem&) = delete; + SharedMem(const SharedMem&) = delete; SharedMem& operator=(const SharedMem&) = delete; - SharedMem(SharedMem&&) = delete; + SharedMem(SharedMem&&) = delete; SharedMem& operator=(SharedMem&&) = delete; }; diff --git a/libgalois/include/galois/runtime/ThreadTimer.h b/libgalois/include/galois/runtime/ThreadTimer.h index 86ae77389d..e75ba51efd 100644 --- a/libgalois/include/galois/runtime/ThreadTimer.h +++ b/libgalois/include/galois/runtime/ThreadTimer.h @@ -49,10 +49,10 @@ class PerThreadTimer : private ThreadTimers { PerThreadTimer(const char* const region, const char* const category) : region_(region), category_(category) {} - PerThreadTimer(const PerThreadTimer&) = delete; - PerThreadTimer(PerThreadTimer&&) = delete; + PerThreadTimer(const PerThreadTimer&) = delete; + PerThreadTimer(PerThreadTimer&&) = delete; PerThreadTimer& operator=(const PerThreadTimer&) = delete; - PerThreadTimer& operator=(PerThreadTimer&&) = delete; + PerThreadTimer& operator=(PerThreadTimer&&) = delete; ~PerThreadTimer() { reportTimes(); } @@ -67,10 +67,10 @@ class PerThreadTimer { public: PerThreadTimer(const char* const, const char* const) {} - PerThreadTimer(const PerThreadTimer&) = delete; - PerThreadTimer(PerThreadTimer&&) = delete; + PerThreadTimer(const PerThreadTimer&) = delete; + PerThreadTimer(PerThreadTimer&&) = delete; PerThreadTimer& operator=(const PerThreadTimer&) = delete; - PerThreadTimer& operator=(PerThreadTimer&&) = delete; + PerThreadTimer& operator=(PerThreadTimer&&) = delete; ~PerThreadTimer() = default; diff --git a/libgalois/include/galois/substrate/PerThreadStorage.h b/libgalois/include/galois/substrate/PerThreadStorage.h index b4a6140dd4..fc43055853 100644 --- a/libgalois/include/galois/substrate/PerThreadStorage.h +++ b/libgalois/include/galois/substrate/PerThreadStorage.h @@ -55,7 +55,7 @@ class PerBackend { public: PerBackend(); - PerBackend(const PerBackend&) = delete; + PerBackend(const PerBackend&) = delete; PerBackend& operator=(const PerBackend&) = delete; ~PerBackend() { @@ -194,7 +194,7 @@ class PerSocketStorage { return *this; } - PerSocketStorage(const PerSocketStorage&) = delete; + PerSocketStorage(const PerSocketStorage&) = delete; PerSocketStorage& operator=(const PerSocketStorage&) = delete; ~PerSocketStorage() { destruct(); } diff --git a/libgalois/include/galois/substrate/SharedMem.h b/libgalois/include/galois/substrate/SharedMem.h index 1c809b52ad..e8a6fe58a4 100644 --- a/libgalois/include/galois/substrate/SharedMem.h +++ b/libgalois/include/galois/substrate/SharedMem.h @@ -48,10 +48,10 @@ class SharedMem { */ ~SharedMem(); - SharedMem(const SharedMem&) = delete; + SharedMem(const SharedMem&) = delete; SharedMem& operator=(const SharedMem&) = delete; - SharedMem(SharedMem&&) = delete; + SharedMem(SharedMem&&) = delete; SharedMem& operator=(SharedMem&&) = delete; }; diff --git a/libgalois/include/galois/substrate/ThreadPool.h b/libgalois/include/galois/substrate/ThreadPool.h index 4158b87321..1ed295d8a0 100644 --- a/libgalois/include/galois/substrate/ThreadPool.h 
+++ b/libgalois/include/galois/substrate/ThreadPool.h @@ -130,10 +130,10 @@ class ThreadPool { public: ~ThreadPool(); - ThreadPool(const ThreadPool&) = delete; + ThreadPool(const ThreadPool&) = delete; ThreadPool& operator=(const ThreadPool&) = delete; - ThreadPool(ThreadPool&&) = delete; + ThreadPool(ThreadPool&&) = delete; ThreadPool& operator=(ThreadPool&&) = delete; //! execute work on all threads diff --git a/libgalois/include/galois/worklists/AdaptiveObim.h b/libgalois/include/galois/worklists/AdaptiveObim.h index 79223cf628..758af8582f 100644 --- a/libgalois/include/galois/worklists/AdaptiveObim.h +++ b/libgalois/include/galois/worklists/AdaptiveObim.h @@ -402,7 +402,7 @@ struct AdaptiveOrderedByIntegerMetric double diff = ((p.maxPrio >> delta) - (p.minPrio >> delta)) >= 1 ? ((p.maxPrio >> delta) - (p.minPrio >> delta)) : 1; - double xx = 16 / diff; + double xx = 16 / diff; if (delta > (unsigned int)(std::floor(std::log2(xx)))) delta -= (unsigned int)(std::floor(std::log2(xx))); else diff --git a/libgalois/include/galois/worklists/Chunk.h b/libgalois/include/galois/worklists/Chunk.h index cf6d697e6a..17398e9ff7 100644 --- a/libgalois/include/galois/worklists/Chunk.h +++ b/libgalois/include/galois/worklists/Chunk.h @@ -145,8 +145,8 @@ struct ChunkMaster { public: typedef T value_type; - ChunkMaster() = default; - ChunkMaster(const ChunkMaster&) = delete; + ChunkMaster() = default; + ChunkMaster(const ChunkMaster&) = delete; ChunkMaster& operator=(const ChunkMaster&) = delete; void flush() { diff --git a/libgalois/include/galois/worklists/WorkList.h b/libgalois/include/galois/worklists/WorkList.h index fab4e80e2a..25eb900785 100644 --- a/libgalois/include/galois/worklists/WorkList.h +++ b/libgalois/include/galois/worklists/WorkList.h @@ -56,7 +56,7 @@ namespace { // don't pollute the symbol table with the example // All classes (should) conform to: template class AbstractWorkList { - AbstractWorkList(const AbstractWorkList&) = delete; + AbstractWorkList(const AbstractWorkList&) = delete; const AbstractWorkList& operator=(const AbstractWorkList&) = delete; public: diff --git a/libgalois/include/shad/DataTypes.h b/libgalois/include/shad/DataTypes.h index 84dc770bee..253d064cbf 100644 --- a/libgalois/include/shad/DataTypes.h +++ b/libgalois/include/shad/DataTypes.h @@ -35,237 +35,245 @@ #include #include - namespace shad { /// @brief Data conversion utilities. -/// +/// /// Please refer to methods specialization to check /// which data types are supported. namespace data_types { - /// @brief Enumeration of supported data types. - /// - /// The enumeration is meant to be used when parsing data - /// (i.e. type information is not known at compile time). 
- enum data_t { - STRING = 0, // string support is currenlty limited - CHARS, // sequence of characters - UINT, // unsigned, binds by default to uint64_t - INT, // int, binds by default to int64_t - FLOAT, // float, binds by default to float - DOUBLE, // double, binds by default to double - BOOL, // bool, binds by default to bool - DATE, // date in "%y-%m-%d" format, binds by default to time_t - USDATE, // date in "%m/%d/%y" format, binds by default to time_t - DATE_TIME, // date in "%y-%m-%dT%H:%M:%S" format, - // binds by default to time_t - IP_ADDRESS, // IPv4, binds by default to data_types::ipv4_t - LIST_UINT, // Sequence of unsigneds, support currently limited - LIST_INT, // Sequence of integers, support currently limited - LIST_DOUBLE, // Sequence of doubles, support currently limited - NONE - }; - - /// @brief Data structures for storing schema information. - /// Given a tuple of data, it associates elements labels and data types - /// to their position in the tuple. - using schema_t = std::vector>; - - /// @brief Encoded null value. - /// @tparam ENC_t encoding type. - /// @return Encoded null value for ENC_t. - template - constexpr ENC_t kNullValue = ENC_t(); - - /// @brief Encoded null value for uint64_t. - /// @return Null encoded value for uint64_t. - template <> - constexpr uint64_t kNullValue = std::numeric_limits::max(); - - /// @brief Encoded null value for time_t (same as long). - /// @return Null encoded value for time_t (same as long). - template <> - constexpr time_t kNullValue = std::numeric_limits::max(); - - /// @brief Encoded null value for double. - /// @return Null encoded value for double. - template <> - constexpr double kNullValue = std::numeric_limits::max(); - - /// @brief Encode Function - /// Available specializations: - /// ENC_t = uint64_t, IN_t = std::string - /// @tparam ENC_t The type to encode to. - /// @tparam IN_t The type (format) of the data to encode. - /// @tparam DT data_types::data_t of the data to encode. - /// @param in Data to encode. - /// @return Encoded data. - template - ENC_t encode(IN_t &in); - - /// @brief Encode Function - /// Available specializations: - /// ENC_t = uint64_t, IN_t = default bindings of data_types::data_t - /// @tparam ENC_t The type to encode to. - /// @tparam IN_t The type of the data to encode. - /// @param in Data to encode. - /// @return Encoded data. - template - ENC_t encode(IN_t &in); - - template - ENC_t encode(IN_t &in, data_t dt); - - template - std::array encode(std::string &str) { - std::array res; - if (str.size() > 0) { - memcpy(res.data(), str.data(), sizeof(ENC_t)*MAX_s); - } else { - res.fill('\0'); - } - return res; - } +/// @brief Enumeration of supported data types. +/// +/// The enumeration is meant to be used when parsing data +/// (i.e. type information is not known at compile time). 
+enum data_t { + STRING = 0, // string support is currenlty limited + CHARS, // sequence of characters + UINT, // unsigned, binds by default to uint64_t + INT, // int, binds by default to int64_t + FLOAT, // float, binds by default to float + DOUBLE, // double, binds by default to double + BOOL, // bool, binds by default to bool + DATE, // date in "%y-%m-%d" format, binds by default to time_t + USDATE, // date in "%m/%d/%y" format, binds by default to time_t + DATE_TIME, // date in "%y-%m-%dT%H:%M:%S" format, + // binds by default to time_t + IP_ADDRESS, // IPv4, binds by default to data_types::ipv4_t + LIST_UINT, // Sequence of unsigneds, support currently limited + LIST_INT, // Sequence of integers, support currently limited + LIST_DOUBLE, // Sequence of doubles, support currently limited + NONE +}; + +/// @brief Data structures for storing schema information. +/// Given a tuple of data, it associates elements labels and data types +/// to their position in the tuple. +using schema_t = std::vector>; + +/// @brief Encoded null value. +/// @tparam ENC_t encoding type. +/// @return Encoded null value for ENC_t. +template +constexpr ENC_t kNullValue = ENC_t(); + +/// @brief Encoded null value for uint64_t. +/// @return Null encoded value for uint64_t. +template <> +constexpr uint64_t kNullValue = std::numeric_limits::max(); + +/// @brief Encoded null value for time_t (same as long). +/// @return Null encoded value for time_t (same as long). +template <> +constexpr time_t kNullValue = std::numeric_limits::max(); + +/// @brief Encoded null value for double. +/// @return Null encoded value for double. +template <> +constexpr double kNullValue = std::numeric_limits::max(); + +/// @brief Encode Function +/// Available specializations: +/// ENC_t = uint64_t, IN_t = std::string +/// @tparam ENC_t The type to encode to. +/// @tparam IN_t The type (format) of the data to encode. +/// @tparam DT data_types::data_t of the data to encode. +/// @param in Data to encode. +/// @return Encoded data. +template +ENC_t encode(IN_t& in); + +/// @brief Encode Function +/// Available specializations: +/// ENC_t = uint64_t, IN_t = default bindings of data_types::data_t +/// @tparam ENC_t The type to encode to. +/// @tparam IN_t The type of the data to encode. +/// @param in Data to encode. +/// @return Encoded data. 
+template +ENC_t encode(IN_t& in); - template - typename std::enable_if<(std::is_arithmetic::value or (sizeof(DEC_t) == sizeof(ENC_t))), DEC_t>::type - decode(ENC_t encvalue) { - DEC_t val; - memcpy(&val, &encvalue, sizeof(DEC_t)); - return val; +template +ENC_t encode(IN_t& in, data_t dt); + +template +std::array encode(std::string& str) { + std::array res; + if (str.size() > 0) { + memcpy(res.data(), str.data(), sizeof(ENC_t) * MAX_s); + } else { + res.fill('\0'); } + return res; +} - template - DEC_t decode(ENC_t value); +template +typename std::enable_if<(std::is_arithmetic::value or + (sizeof(DEC_t) == sizeof(ENC_t))), + DEC_t>::type +decode(ENC_t encvalue) { + DEC_t val; + memcpy(&val, &encvalue, sizeof(DEC_t)); + return val; +} - template - typename std::enable_if<(ST==data_t::INT), int64_t>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +DEC_t decode(ENC_t value); - template - typename std::enable_if<(ST==data_t::UINT), uint64_t>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::INT), int64_t>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::FLOAT), float>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::UINT), uint64_t>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::DOUBLE), double>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::FLOAT), float>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::BOOL), bool>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::DOUBLE), double>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::DATE), std::time_t>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } - - template - std::string decode(std::array &val) { - return std::string(reinterpret_cast(val.data())); - } -} // namespace data_types +template +typename std::enable_if<(ST == data_t::BOOL), bool>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} +template +typename std::enable_if<(ST == data_t::DATE), std::time_t>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} + +template +std::string decode(std::array& val) { + return std::string(reinterpret_cast(val.data())); +} +} // namespace data_types // ENCODE METHODS SPECIALIZATION FOR UINT64 ENC_t -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t value; - try { value = std::stoull(str); } - catch(...) { value = kNullValue; } + try { + value = std::stoull(str); + } catch (...) { + value = kNullValue; + } return value; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t encval; int64_t value; - try { value = stoll(str); } - catch(...) { return kNullValue; } + try { + value = stoll(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t encval; float value; - try { value = stof(str); } - catch(...) 
{ return kNullValue; } + try { + value = stof(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t data_types::encode( + std::string& str) { uint64_t encval; double value; - try { value = stod(str); } - catch(...) { return kNullValue; } + try { + value = stod(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { - if (str.size() == 0) return kNullValue; +template <> +inline uint64_t +data_types::encode(std::string& str) { + if (str.size() == 0) + return kNullValue; uint64_t encval = 1; - if ((str == "F") || (str == "f") || (str == "FALSE") - || (str == "false") || (str == "0")) encval = 0; + if ((str == "F") || (str == "f") || (str == "FALSE") || (str == "false") || + (str == "0")) + encval = 0; return encval; } - -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t encval = 0; memset(&encval, '\0', sizeof(encval)); - memcpy(&encval, str.c_str(), sizeof(encval)-1); + memcpy(&encval, str.c_str(), sizeof(encval) - 1); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode( + std::string& str) { uint64_t val, value = 0; std::string::iterator start = str.begin(); - for (unsigned i = 0; i < 4; i ++) { + for (unsigned i = 0; i < 4; i++) { std::string::iterator end = std::find(start, str.end(), '.'); try { val = std::stoull(std::string(start, end)); - } catch(...) { + } catch (...) { return kNullValue; } if (val < 256) { - value = (value << 8) + val; start = end + 1; + value = (value << 8) + val; + start = end + 1; } else { return kNullValue; } @@ -273,57 +281,52 @@ uint64_t data_types::encode inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%d", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t data_types::encode( + std::string& str) { uint64_t value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%m/%d/%y", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode( + std::string& str) { uint64_t value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); @@ -332,89 +335,96 @@ uint64_t data_types::encode inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval; uint64_t value; - try { value = std::stoull(str); } - catch(...) { return kNullValue; } + try { + value = std::stoull(str); + } catch (...) 
{ + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval; int64_t value; - try { value = stoll(str); } - catch(...) { return kNullValue; } + try { + value = stoll(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval; float value; - try { value = stof(str); } - catch(...) { return kNullValue; } + try { + value = stof(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double value; - try { value = stod(str); } - catch(...) { return kNullValue; } + try { + value = stod(str); + } catch (...) { + return kNullValue; + } return value; } -template<> inline -double data_types::encode(std::string &str) { - if (str.size() == 0) return kNullValue; +template <> +inline double +data_types::encode(std::string& str) { + if (str.size() == 0) + return kNullValue; double encval = 1; - if ((str == "F") || (str == "f") || (str == "FALSE") - || (str == "false") || (str == "0")) encval = 0; + if ((str == "F") || (str == "f") || (str == "FALSE") || (str == "false") || + (str == "0")) + encval = 0; return encval; } - -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval = 0; memset(&encval, '\0', sizeof(encval)); - memcpy(&encval, str.c_str(), sizeof(encval)-1); + memcpy(&encval, str.c_str(), sizeof(encval) - 1); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double data_types::encode( + std::string& str) { uint64_t val, value = 0; std::string::iterator start = str.begin(); - for (unsigned i = 0; i < 4; i ++) { + for (unsigned i = 0; i < 4; i++) { std::string::iterator end = std::find(start, str.end(), '.'); try { val = std::stoull(std::string(start, end)); - } catch(...) { + } catch (...) { return kNullValue; } if (val < 256) { - value = (value << 8) + val; start = end + 1; + value = (value << 8) + val; + start = end + 1; } else { return kNullValue; } @@ -424,57 +434,51 @@ double data_types::encode inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%d", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%m/%d/%y", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) 
{ return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double data_types::encode( + std::string& str) { double value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); @@ -482,87 +486,94 @@ double data_types::encode inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t value; - try { value = std::stoul(str); } - catch(...) { value = kNullValue; } + try { + value = std::stoul(str); + } catch (...) { + value = kNullValue; + } return value; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { int64_t value; - try { value = stol(str); } - catch(...) { return kNullValue; } + try { + value = stol(str); + } catch (...) { + return kNullValue; + } return value; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t encval; float value; - try { value = stof(str); } - catch(...) { return kNullValue; } + try { + value = stof(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t encval; double value; - try { value = stod(str); } - catch(...) { return kNullValue; } + try { + value = stod(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -time_t data_types::encode(std::string &str) { - if (str.size() == 0) return kNullValue; +template <> +inline time_t +data_types::encode(std::string& str) { + if (str.size() == 0) + return kNullValue; time_t encval = 1; - if ((str == "F") || (str == "f") || (str == "FALSE") - || (str == "false") || (str == "0")) encval = 0; + if ((str == "F") || (str == "f") || (str == "FALSE") || (str == "false") || + (str == "0")) + encval = 0; return encval; } - -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t encval = 0; memset(&encval, '\0', sizeof(encval)); - memcpy(&encval, str.c_str(), sizeof(encval)-1); + memcpy(&encval, str.c_str(), sizeof(encval) - 1); return encval; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t data_types::encode( + std::string& str) { time_t val, value = 0; std::string::iterator start = str.begin(); - for (unsigned i = 0; i < 4; i ++) { + for (unsigned i = 0; i < 4; i++) { std::string::iterator end = std::find(start, str.end(), '.'); try { val = std::stoull(std::string(start, end)); - } catch(...) { + } catch (...) { return kNullValue; } if (val < 256) { - value = (value << 8) + val; start = end + 1; + value = (value << 8) + val; + start = end + 1; } else { return kNullValue; } @@ -570,165 +581,161 @@ time_t data_types::encode inline -time_t data_types::encode(std::string &str) { - struct tm date{}; +template <> +inline time_t +data_types::encode(std::string& str) { + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%d", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) 
{ return kNullValue; } return t; } -template<> inline -time_t data_types::encode(std::string &str) { - struct tm date{}; +template <> +inline time_t +data_types::encode(std::string& str) { + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%m/%d/%y", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } return t; } -template<> inline -time_t data_types::encode(std::string &str) { - struct tm date{}; +template <> +inline time_t data_types::encode( + std::string& str) { + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } return t; } template -ENC_t data_types::encode(IN_t &in, data_types::data_t dt) { +ENC_t data_types::encode(IN_t& in, data_types::data_t dt) { switch (dt) { -// case data_types::STRING : -// return data_types::encode(in); -// case data_types::CHARS : -// return data_types::encode(in); - case data_types::UINT : - return data_types::encode(in); - case data_types::INT : - return data_types::encode(in); - case data_types::FLOAT : - return data_types::encode(in); - case data_types::DOUBLE : - return data_types::encode(in); - case data_types::BOOL : - return data_types::encode(in); - case data_types::DATE : - return data_types::encode(in); - case data_types::USDATE : - return data_types::encode(in); - case data_types::DATE_TIME : - return data_types::encode(in); - case data_types::IP_ADDRESS : - return data_types::encode(in); + // case data_types::STRING : + // return data_types::encode(in); + // case data_types::CHARS : + // return data_types::encode(in); + case data_types::UINT: + return data_types::encode(in); + case data_types::INT: + return data_types::encode(in); + case data_types::FLOAT: + return data_types::encode(in); + case data_types::DOUBLE: + return data_types::encode(in); + case data_types::BOOL: + return data_types::encode(in); + case data_types::DATE: + return data_types::encode(in); + case data_types::USDATE: + return data_types::encode(in); + case data_types::DATE_TIME: + return data_types::encode(in); + case data_types::IP_ADDRESS: + return data_types::encode(in); } return data_types::kNullValue; } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; return std::to_string(value); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; int64_t v; memcpy(&v, &value, sizeof(v)); return std::to_string(v); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; float v; memcpy(&v, &value, sizeof(v)); return std::to_string(v); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; double v; memcpy(&v, &value, sizeof(v)); return std::to_string(v); } -template<> inline -std::string data_types::decode(uint64_t value) { +template <> +inline std::string +data_types::decode( + uint64_t value) { std::string ipAddr = ""; uint64_t octets[4]; 
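All of these encode()/decode() specializations follow the same two conventions: the value parsed from text is bit-copied into the fixed-width encoding type with memcpy rather than converted arithmetically, and any parse failure yields the type's kNullValue sentinel. The sketch below is illustrative only and independent of the shad headers (it assumes a 64-bit double); it shows the same round trip for a double packed into a uint64_t and for an IPv4 dotted quad packed one octet per byte, matching the packing used by the IP_ADDRESS specializations here.

#include <cstdint>
#include <cstring>
#include <sstream>
#include <string>

// Bit-copy a double into a uint64_t and back; the byte pattern is preserved.
uint64_t pack_double(double v) {
  uint64_t enc;
  std::memcpy(&enc, &v, sizeof(enc));
  return enc;
}
double unpack_double(uint64_t enc) {
  double v;
  std::memcpy(&v, &enc, sizeof(v));
  return v;
}

// Pack "a.b.c.d" into a uint64_t, most significant octet first; return the
// all-ones null sentinel (as kNullValue<uint64_t> does above) on a bad field.
uint64_t pack_ipv4(const std::string& s) {
  std::istringstream in(s);
  uint64_t value = 0, octet = 0;
  char dot = 0;
  for (int i = 0; i < 4; ++i) {
    if (!(in >> octet) || octet > 255)
      return UINT64_MAX;
    if (i < 3 && (!(in >> dot) || dot != '.'))
      return UINT64_MAX;
    value = (value << 8) + octet;
  }
  return value; // "10.0.0.1" -> 0x0A000001
}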
- for (uint64_t k = 0; k < 4; k ++) {octets[k] = value & 255; value = value >> 8;} - for (uint64_t k = 3; k >= 1; k --) ipAddr += std::to_string(octets[k]) + '.'; + for (uint64_t k = 0; k < 4; k++) { + octets[k] = value & 255; + value = value >> 8; + } + for (uint64_t k = 3; k >= 1; k--) + ipAddr += std::to_string(octets[k]) + '.'; return ipAddr + std::to_string(octets[0]); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; return std::to_string(value); } -template<> inline -std::string data_types::decode(uint64_t value) { +template <> +inline std::string +data_types::decode(uint64_t value) { time_t t = data_types::decode(value); char dateString[11]; strftime(dateString, 11, "%Y-%m-%d", std::localtime(&t)); return std::string(dateString); } -template<> inline -std::string data_types::decode(uint64_t value) { +template <> +inline std::string +data_types::decode(uint64_t value) { const char* c = reinterpret_cast(&value); return std::string(c); } -template <> inline -uint64_t data_types::decode(uint64_t encvalue) { +template <> +inline uint64_t data_types::decode(uint64_t encvalue) { return encvalue; } -} // namespace shad +} // namespace shad #endif // LIBGALOIS_INCLUDE_SHAD_DATA_TYPES_H_ diff --git a/libgalois/include/shad/Graph.h b/libgalois/include/shad/Graph.h index 9029b1ef32..2c785e53d5 100644 --- a/libgalois/include/shad/Graph.h +++ b/libgalois/include/shad/Graph.h @@ -1,5 +1,5 @@ -//TODO(hc): Upgrade copyright if it is necessary; for now, we have no plan -// to make this public. +// TODO(hc): Upgrade copyright if it is necessary; for now, we have no plan +// to make this public. //===------------------------------------------------------------*- C++ -*-===// // @@ -50,7 +50,7 @@ #include "DataTypes.h" #include "GraphTypes.h" -#define UINT shad::data_types::UINT +#define UINT shad::data_types::UINT #define DOUBLE shad::data_types::DOUBLE #define USDATE shad::data_types::USDATE #define ENCODE shad::data_types::encode @@ -58,112 +58,120 @@ namespace shad { class Vertex { - public: - // Vertex id; initially it is set - // to a local node id while CuSP reads a file and constructs - // this vertex. After each host finishes and synchronizes it to construct - // a full CSR graph, it is updated to a global node id. - uint64_t id; - TYPES type; - uint64_t shadKey; - // Number of edges. - // This is incremented while reads a graph. - uint64_t numEdges{0}; - - Vertex () { - this->id = shad::data_types::kNullValue; - this->type = TYPES::NONE; - this->shadKey = shad::data_types::kNullValue; - } - - Vertex (uint64_t id_, TYPES type_, uint64_t shadKey_) { - this->id = id_; - this->type = type_; - this->shadKey = shadKey_; - } - - void incrNumEdges() { - this->numEdges += 1; - } - - uint64_t getNumEdges() { - return this->numEdges; - } +public: + // Vertex id; initially it is set + // to a local node id while CuSP reads a file and constructs + // this vertex. After each host finishes and synchronizes it to construct + // a full CSR graph, it is updated to a global node id. + uint64_t id; + TYPES type; + uint64_t shadKey; + // Number of edges. + // This is incremented while reads a graph. 
+ uint64_t numEdges{0}; + + Vertex() { + this->id = shad::data_types::kNullValue; + this->type = TYPES::NONE; + this->shadKey = shad::data_types::kNullValue; + } + + Vertex(uint64_t id_, TYPES type_, uint64_t shadKey_) { + this->id = id_; + this->type = type_; + this->shadKey = shadKey_; + } + + void incrNumEdges() { this->numEdges += 1; } + + uint64_t getNumEdges() { return this->numEdges; } }; class Edge { - public: - uint64_t src; // vertex id of src - uint64_t dst; // vertex id of dst - TYPES type; - TYPES src_type; - TYPES dst_type; - uint64_t src_glbid; - uint64_t dst_glbid; - - Edge () { - src = shad::data_types::kNullValue; - dst = shad::data_types::kNullValue; - type = TYPES::NONE; - src_type = TYPES::NONE; - dst_type = TYPES::NONE; +public: + uint64_t src; // vertex id of src + uint64_t dst; // vertex id of dst + TYPES type; + TYPES src_type; + TYPES dst_type; + uint64_t src_glbid; + uint64_t dst_glbid; + + Edge() { + src = shad::data_types::kNullValue; + dst = shad::data_types::kNullValue; + type = TYPES::NONE; + src_type = TYPES::NONE; + dst_type = TYPES::NONE; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } + + Edge(std::vector& tokens) { + if (tokens[0] == "Sale") { + src = ENCODE(tokens[1]); + dst = ENCODE(tokens[2]); + type = TYPES::SALE; + src_type = TYPES::PERSON; + dst_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "Author") { + src = ENCODE(tokens[1]); + type = TYPES::AUTHOR; + src_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") + dst = ENCODE(tokens[3]); + else if (tokens[4] != "") + dst = ENCODE(tokens[4]); + else if (tokens[5] != "") + dst = ENCODE(tokens[5]); + if (tokens[3] != "") + dst_type = TYPES::FORUM; + else if (tokens[4] != "") + dst_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") + dst_type = TYPES::PUBLICATION; + } else if (tokens[0] == "Includes") { + src = ENCODE(tokens[3]); + dst = ENCODE(tokens[4]); + type = TYPES::INCLUDES; + src_type = TYPES::FORUM; + dst_type = TYPES::FORUMEVENT; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "HasTopic") { + dst = ENCODE(tokens[6]); + type = TYPES::HASTOPIC; + dst_type = TYPES::TOPIC; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") + src = ENCODE(tokens[3]); + else if (tokens[4] != "") + src = ENCODE(tokens[4]); + else if (tokens[5] != "") + src = ENCODE(tokens[5]); + if (tokens[3] != "") + src_type = TYPES::FORUM; + else if (tokens[4] != "") + src_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") + src_type = TYPES::PUBLICATION; + } else if (tokens[0] == "HasOrg") { + src = ENCODE(tokens[5]); + dst = ENCODE(tokens[6]); + type = TYPES::HASORG; + src_type = TYPES::PUBLICATION; + dst_type = TYPES::TOPIC; src_glbid = shad::data_types::kNullValue; dst_glbid = shad::data_types::kNullValue; } - - Edge (std::vector & tokens) { - if (tokens[0] == "Sale") { - src = ENCODE(tokens[1]); - dst = ENCODE(tokens[2]); - type = TYPES::SALE; - src_type = TYPES::PERSON; - dst_type = TYPES::PERSON; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - } else if (tokens[0] == "Author") { - src = ENCODE(tokens[1]); - type = TYPES::AUTHOR; - src_type = TYPES::PERSON; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; 
- if (tokens[3] != "") dst = ENCODE(tokens[3]); - else if (tokens[4] != "") dst = ENCODE(tokens[4]); - else if (tokens[5] != "") dst = ENCODE(tokens[5]); - if (tokens[3] != "") dst_type = TYPES::FORUM; - else if (tokens[4] != "") dst_type = TYPES::FORUMEVENT; - else if (tokens[5] != "") dst_type = TYPES::PUBLICATION; - } else if (tokens[0] == "Includes") { - src = ENCODE(tokens[3]); - dst = ENCODE(tokens[4]); - type = TYPES::INCLUDES; - src_type = TYPES::FORUM; - dst_type = TYPES::FORUMEVENT; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - } else if (tokens[0] == "HasTopic") { - dst = ENCODE(tokens[6]); - type = TYPES::HASTOPIC; - dst_type = TYPES::TOPIC; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - if (tokens[3] != "") src = ENCODE(tokens[3]); - else if (tokens[4] != "") src = ENCODE(tokens[4]); - else if (tokens[5] != "") src = ENCODE(tokens[5]); - if (tokens[3] != "") src_type = TYPES::FORUM; - else if (tokens[4] != "") src_type = TYPES::FORUMEVENT; - else if (tokens[5] != "") src_type = TYPES::PUBLICATION; - } else if (tokens[0] == "HasOrg") { - src = ENCODE(tokens[5]); - dst = ENCODE(tokens[6]); - type = TYPES::HASORG; - src_type = TYPES::PUBLICATION; - dst_type = TYPES::TOPIC; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - } - } + } }; -} // namespace agile::workflow1 +} // namespace shad #endif // GRAPH_H diff --git a/libgalois/include/shad/GraphTypes.h b/libgalois/include/shad/GraphTypes.h index eb84e123c2..e9f7afc0ab 100644 --- a/libgalois/include/shad/GraphTypes.h +++ b/libgalois/include/shad/GraphTypes.h @@ -66,6 +66,6 @@ enum class TYPES { NONE }; -} // namespace agile::workflow1 +} // namespace shad #endif // GRAPHTYPES_H diff --git a/libgalois/include/shad/ShadGraphConverter.h b/libgalois/include/shad/ShadGraphConverter.h index 4b1c0351db..87cef93a93 100644 --- a/libgalois/include/shad/ShadGraphConverter.h +++ b/libgalois/include/shad/ShadGraphConverter.h @@ -25,8 +25,7 @@ using ShadEdgeTy = uint64_t; class ShadGraphConverter { public: - ShadGraphConverter() : - nodeDataBuffer(nullptr) {} + ShadGraphConverter() : nodeDataBuffer(nullptr) {} ~ShadGraphConverter() { // BufferedGraph holds these arrays. 
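For reference, the Edge(tokens) constructor in Graph.h above keys everything off which columns of a row are non-empty: token 0 names the relation, tokens 1 and 2 carry person keys, token 3 a forum key, token 4 a forum-event key, token 5 a publication key, and token 6 a topic key. That layout is inferred from the constructor's branches, and the keys in the example below are invented purely for illustration (it also assumes Graph.h and GraphTypes.h are included):

// Hypothetical "Author" row: person 1234 authored publication 9999.
// tokens[3] and tokens[4] are empty, so the Author branch falls through to
// dst = ENCODE(tokens[5]) and dst_type = TYPES::PUBLICATION.
std::vector<std::string> tokens = {"Author", "1234", "", "", "",
                                   "9999",   "",     "", "", ""};
shad::Edge e(tokens);
// e.type == shad::TYPES::AUTHOR, e.src_type == shad::TYPES::PERSON,
// e.dst_type == shad::TYPES::PUBLICATION; e.src and e.dst hold the
// UINT-encoded keys, and the global ids stay at the null sentinel.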
@@ -43,15 +42,15 @@ class ShadGraphConverter { std::ofstream fp("shad_graph.out"); for (size_t i = 0; i < this->verticeIdKeyMapping.size(); ++i) { uint64_t key = this->verticeIdKeyMapping[i]; - Vertex v = this->vertices[key]; - fp << "node " << i << ", type: " << to_underlying(v.type) << ", key: " << - key << "\n"; + Vertex v = this->vertices[key]; + fp << "node " << i << ", type: " << to_underlying(v.type) + << ", key: " << key << "\n"; auto edgeRange = this->edges.equal_range(key); - for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { + for (auto ei = edgeRange.first; ei != edgeRange.second; ++ei) { Edge& edge = ei->second; Vertex dst = this->vertices[edge.dst]; - fp << "\t edge dst " << dst.id << ", type: " << - to_underlying(edge.type) << ", key: " << dst.shadKey << "\n"; + fp << "\t edge dst " << dst.id << ", type: " << to_underlying(edge.type) + << ", key: " << dst.shadKey << "\n"; } } fp.close(); @@ -71,7 +70,7 @@ class ShadGraphConverter { * @param numEdges number of edges that this method reads */ void InspectGraph(const std::string& filename, size_t* numNodes, - size_t* numEdges) { + size_t* numEdges) { // TODO(hc): Get the number of nodes and edges from file // For example, it reads {SALE, Author, Includes, HasTopic, HasOrg} as // edges. So we just count how many they exist in the file. @@ -85,9 +84,10 @@ class ShadGraphConverter { while (!file.eof()) { getline(file, line); // Skip comments. - if (line[0] == '#') continue; + if (line[0] == '#') + continue; // Delimiter and # tokens set for WMD data file. - std::vector tokens = splitTokens(line, ',', 10); + std::vector tokens = splitTokens(line, ',', 10); if (this->isTokenNodeType(tokens[0])) { ++(*numNodes); @@ -96,15 +96,15 @@ class ShadGraphConverter { } } - std::cout << "Number of nodes:" << *numNodes << ", number of edges:" << - *numEdges << "\n"; + std::cout << "Number of nodes:" << *numNodes + << ", number of edges:" << *numEdges << "\n"; } /** * @brief Construct a buffered graph from existing arrays constructed * by constructNodeArrays() and constructEdgeArrays(). * - * @param numGlobalNodes The number of global nodes + * @param numGlobalNodes The number of global nodes * @param numGlobalEdges The number of global edges * @param nodeBegin Global node ID of the first local node * @param nodeEnd (Global node ID of the last local node) + 1 @@ -113,32 +113,31 @@ class ShadGraphConverter { * @param bufferedGraph Buffered graph for CuSP */ void constructBufferedGraph( - uint64_t numGlobalNodes, uint64_t numGlobalEdges, - uint32_t nodeBegin, uint32_t nodeEnd, - uint64_t edgeBegin, uint64_t edgeEnd, - [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { + uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint32_t nodeBegin, + uint32_t nodeEnd, uint64_t edgeBegin, uint64_t edgeEnd, + [[maybe_unused]] galois::graphs::BufferedGraph* + bufferedGraph) { // TODO(hc): Each of these functions first construct graphs in the SHAD - // format as this file is written in not binary, but string, and also + // format as this file is written in not binary, but string, and also // nodes or edges are not sorted. 
So, until we preprocess the input graph // file, we should first read it in memory, and reconstruct this to Galois - // compatible + // compatible uint32_t numLocalNodes = nodeEnd - nodeBegin; uint64_t numLocalEdges = edgeEnd - edgeBegin; - bufferedGraph->constructFrom( - outIndexBuffer, edgeDestBuffer, edgeDataBuffer, - numGlobalNodes, numGlobalEdges, numLocalNodes, numLocalEdges, - nodeBegin, edgeBegin); + bufferedGraph->constructFrom(outIndexBuffer, edgeDestBuffer, edgeDataBuffer, + numGlobalNodes, numGlobalEdges, numLocalNodes, + numLocalEdges, nodeBegin, edgeBegin); #if 0 - TODO(hc): This verification should be fixed since it tests + TODO(hc): This verification should be fixed since it tests a shared-memory execution that one host loads the whole graph. It should not work on distributed-memory machine since a CSR graph should be partitioned but tepmorary maps reading and holding SHAD graphs are for global graph. #ifndef NDEBUG std::cout << "CSR verification starts.." << std::endl << std::flush; - this->VerifyCSRConstruction(outIndexBuffer, nodeDataBuffer, + this->VerifyCSRConstruction(outIndexBuffer, nodeDataBuffer, edgeDestBuffer, edgeDataBuffer); std::cout << "CSR verification starts.. [done]" << std::endl << std::flush; #endif @@ -153,20 +152,20 @@ class ShadGraphConverter { */ // TODO(hc): We can assign a disjointed range of file for each host. // For now, let all hosts read the whole file. - void readSHADFile( - const std::string& filename, uint64_t* numGlobalNodes, - uint64_t *numGlobalEdges) { + void readSHADFile(const std::string& filename, uint64_t* numGlobalNodes, + uint64_t* numGlobalEdges) { std::ifstream graphFile(filename.c_str()); uint64_t vertexId{0}; std::string line; uint64_t numNodes{0}, numEdges{0}; // TODO(hc): We can parallelize it by assigning disjointed // ranges with some inspection. - // But this would be the future work as + // But this would be the future work as while (!graphFile.eof()) { getline(graphFile, line); // Skip comments. - if (line[0] == '#') continue; + if (line[0] == '#') + continue; // Delimiter and # tokens set for WMD data file. std::vector tokens = splitTokens(line, ',', 10); @@ -242,7 +241,7 @@ class ShadGraphConverter { } } - // After the above loop, vertices and edges are complete. + // After the above loop, vertices and edges are complete. this->CountNumEdgesForEachVertex(numNodes, numEdges); *numGlobalNodes = numNodes; *numGlobalEdges = numEdges; @@ -253,20 +252,16 @@ class ShadGraphConverter { } /** - * @brief Return node data array. + * @brief Return node data array. * Note that this can be either of global graph or local graph. */ - ShadNodeTy* getNodeDataBuffer() { - return nodeDataBuffer; - } + ShadNodeTy* getNodeDataBuffer() { return nodeDataBuffer; } /** * @brief Return node outgoing edge index array * Note that this can be either of global graph or local graph. */ - uint64_t* getOutIndexBuffer() { - return outIndexBuffer; - } + uint64_t* getOutIndexBuffer() { return outIndexBuffer; } /** * @brief Construct vertex outgoing edge range buffer and @@ -282,36 +277,33 @@ class ShadGraphConverter { * @param numLocalNodes The number of local nodes * */ - void constructNodeArrays( - uint32_t nodeBegin, uint32_t nodeEnd, uint32_t numLocalNodes) { + void constructNodeArrays(uint32_t nodeBegin, uint32_t nodeEnd, + uint32_t numLocalNodes) { // 1) Construct an edge index array (size == number of nodes). 
this->outIndexBuffer = new uint64_t[numLocalNodes]; this->nodeDataBuffer = new ShadNodeTy[numLocalNodes]; - // TODO(hc): for now, only consider a single host, but need to add offset later. - galois::do_all(galois::iterate(this->vertices), - [&](auto element) { - Vertex& vertex = element.second; - uint64_t vertexId = vertex.id; - if (vertexId >= nodeBegin && vertexId < nodeEnd) { - this->outIndexBuffer[vertexId - nodeBegin] = - vertex.getNumEdges(); - // Fill vertex data too; This assumes that a SHAD graph - // has a type, which is considered as a vertex data. - this->nodeDataBuffer[vertexId - nodeBegin].type = - this->to_underlying(vertex.type); - this->nodeDataBuffer[vertexId - nodeBegin].key = - vertex.shadKey; - //std::cout << vertexId - nodeBegin << " is set to " - //<< this->nodeDataBuffer[vertexId - nodeBegin].type << " and " << - //this->nodeDataBuffer[vertexId - nodeBegin].key << "\n"; - } - }); + // TODO(hc): for now, only consider a single host, but need to add offset + // later. + galois::do_all(galois::iterate(this->vertices), [&](auto element) { + Vertex& vertex = element.second; + uint64_t vertexId = vertex.id; + if (vertexId >= nodeBegin && vertexId < nodeEnd) { + this->outIndexBuffer[vertexId - nodeBegin] = vertex.getNumEdges(); + // Fill vertex data too; This assumes that a SHAD graph + // has a type, which is considered as a vertex data. + this->nodeDataBuffer[vertexId - nodeBegin].type = + this->to_underlying(vertex.type); + this->nodeDataBuffer[vertexId - nodeBegin].key = vertex.shadKey; + // std::cout << vertexId - nodeBegin << " is set to " + //<< this->nodeDataBuffer[vertexId - nodeBegin].type << " and " << + // this->nodeDataBuffer[vertexId - nodeBegin].key << "\n"; + } + }); // 2) Perform parallel prefix sum to finalize outgoing edge index // array construction. galois::ParallelSTL::partial_sum( - outIndexBuffer, &(outIndexBuffer[numLocalNodes]), - outIndexBuffer); + outIndexBuffer, &(outIndexBuffer[numLocalNodes]), outIndexBuffer); } /** @@ -331,12 +323,10 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> - void constructEdgeArrays( - uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, - uint64_t numLocalEdges) { + template >* = nullptr> + void constructEdgeArrays(uint32_t nodeBegin, uint64_t edgeBegin, + uint32_t numLocalNodes, uint64_t numLocalEdges) { this->edgeDestBuffer = new uint32_t[numLocalEdges]; this->edgeDataBuffer = new ShadEdgeTy[numLocalEdges]; std::vector edgeIndexPointers(numLocalNodes, 0); @@ -346,8 +336,8 @@ class ShadGraphConverter { galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); // 2) Each thread iterates the whole edges. for (auto edgeElem : this->edges) { - uint64_t srcVertex = edgeElem.first; - Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; uint64_t srcVertexId = vertex.id; // 3) Each thread fills edge destination for the assigned nodes. if (srcVertexId >= thread_work_range.first + nodeBegin && @@ -356,8 +346,9 @@ class ShadGraphConverter { // OutIndexBuffer now contains global edge range. // So we need to subtract edge offset to get the local edge id. uint64_t nodeBaseOffset = - ((srcVertexId - nodeBegin) == 0)? - 0 : outIndexBuffer[srcVertexId - nodeBegin - 1] - edgeBegin; + ((srcVertexId - nodeBegin) == 0) + ? 
0 + : outIndexBuffer[srcVertexId - nodeBegin - 1] - edgeBegin; edgeDestBuffer[edgeIdx + nodeBaseOffset] = this->vertices[edgeElem.second.dst].id; edgeDataBuffer[edgeIdx + nodeBaseOffset] = @@ -386,12 +377,10 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> - void constructEdgeArrays( - uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, - uint64_t numLocalEdges) { + template >* = nullptr> + void constructEdgeArrays(uint32_t nodeBegin, uint64_t edgeBegin, + uint32_t numLocalNodes, uint64_t numLocalEdges) { edgeDestBuffer = new uint32_t[numLocalEdges]; std::vector edgeIndexPointers(numLocalNodes, 0); galois::on_each([&](uint32_t tid, uint32_t numThreads) { @@ -400,16 +389,17 @@ class ShadGraphConverter { galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); // 2) Each thread iterates the whole edges. for (auto edgeElem : this->edges) { - uint64_t srcVertex = edgeElem.first; - Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; uint64_t srcVertexId = vertex.id; // 3) Each thread fills edge destination for the assigned nodes. if (srcVertexId >= thread_work_range.first + nodeBegin && srcVertexId < thread_work_range.second + nodeBegin) { uint64_t edgeIdx = edgeIndexPointers[srcVertexId - nodeBegin]++; uint64_t nodeBaseOffset = - ((srcVertexId - nodeBegin)== 0)? - 0 : outIndexBuffer[srcVertexId - 1] - edgeBegin; + ((srcVertexId - nodeBegin) == 0) + ? 0 + : outIndexBuffer[srcVertexId - 1] - edgeBegin; edgeDestBuffer[edgeIdx + nodeBaseOffset] = this->vertices[edgeElem.second.dst].id; } @@ -421,7 +411,7 @@ class ShadGraphConverter { /** * @brief Extract outgoing edge index ranges for local vertices - * from the global outgoing edge index range array. + * from the global outgoing edge index range array. 
* * @param nodeBegin Node global id of the first local node * @param nodeEnd (Node global id for the last local node + 1) @@ -429,10 +419,9 @@ class ShadGraphConverter { void extractLocalOutIndexArray(uint32_t nodeBegin, uint32_t nodeEnd) { uint64_t* newOutIndexBuffer = new uint64_t[nodeEnd - nodeBegin]; - galois::do_all(galois::iterate(nodeBegin, nodeEnd), - [&](uint32_t n) { - newOutIndexBuffer[n - nodeBegin] = this->outIndexBuffer[n]; - } ); + galois::do_all(galois::iterate(nodeBegin, nodeEnd), [&](uint32_t n) { + newOutIndexBuffer[n - nodeBegin] = this->outIndexBuffer[n]; + }); delete[] this->outIndexBuffer; this->outIndexBuffer = newOutIndexBuffer; } @@ -449,7 +438,7 @@ class ShadGraphConverter { * a temporary vertex map */ bool checkNode(uint64_t id, int type) { - uint64_t key = this->verticeIdKeyMapping[id]; + uint64_t key = this->verticeIdKeyMapping[id]; Vertex& vertex = this->vertices[key]; return (this->to_underlying(vertex.type) == type); } @@ -467,20 +456,19 @@ class ShadGraphConverter { * @return True if passed information matches to the one in * a temporary edge map */ - bool checkEdge(uint64_t snid, uint64_t dnid, - uint64_t /*eid*/, int type) { - uint64_t skey = this->verticeIdKeyMapping[snid]; + bool checkEdge(uint64_t snid, uint64_t dnid, uint64_t /*eid*/, int type) { + uint64_t skey = this->verticeIdKeyMapping[snid]; auto edgeRange = this->edges.equal_range(skey); uint64_t eidx{0}; Edge edge; bool found{false}; - for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei, ++eidx) { + for (auto ei = edgeRange.first; ei != edgeRange.second; ++ei, ++eidx) { edge = ei->second; // Multiple edges having the same source and destination could // exist. So we repeat until find the one that has the same type to // the passed one. if (this->vertices[edge.dst].id == dnid && - this->to_underlying(edge.type) == type) { + this->to_underlying(edge.type) == type) { found = true; break; } @@ -517,16 +505,16 @@ class ShadGraphConverter { } } - std::vector splitTokens( - std::string& line, char delim, uint64_t size = 0) { + std::vector splitTokens(std::string& line, char delim, + uint64_t size = 0) { uint64_t ndx = 0, start = 0, end = 0; - std::vector tokens(size); + std::vector tokens(size); - for ( ; end < line.length(); end ++) { + for (; end < line.length(); end++) { if ((line[end] == delim) || (line[end] == '\n')) { - tokens[ndx] = line.substr(start, end - start); - start = end + 1; - ndx ++; + tokens[ndx] = line.substr(start, end - start); + start = end + 1; + ndx++; } } @@ -536,9 +524,8 @@ class ShadGraphConverter { } void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t /*numEdges*/) { - //galois::on_each([this, numNodes, numEdges]( - galois::on_each([&]( - uint32_t tid, uint32_t numThreads) { + // galois::on_each([this, numNodes, numEdges]( + galois::on_each([&](uint32_t tid, uint32_t numThreads) { // Each thread is assigned disjointed range of nodes. // Each thread iterates edges and accumulates edges for only // the nodes assigned to that. 
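Taken together, CountNumEdgesForEachVertex(), constructNodeArrays(), and constructEdgeArrays() implement the usual three-phase CSR build: count each vertex's out-degree (every thread scans all edges under galois::on_each but only updates the vertices in its galois::block_range slice), prefix-sum the counts into the out-index array, then scatter edge destinations into the slots the prefix sum reserved, with a per-vertex cursor (edgeIndexPointers) tracking how many slots are already filled. The sketch below is a simplified, single-threaded illustration of that flow using a toy edge list and plain std:: calls instead of the galois:: parallel helpers; it is not the library's code.

#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

int main() {
  // Toy input: 4 vertices, edges as (src, dst) pairs in arbitrary order.
  std::vector<std::pair<uint32_t, uint32_t>> edges = {
      {0, 1}, {0, 2}, {2, 3}, {1, 3}, {2, 0}};
  const uint32_t numNodes = 4;

  // Phase 1: count out-degrees (CountNumEdgesForEachVertex).
  std::vector<uint64_t> outIndex(numNodes, 0);
  for (auto& e : edges)
    ++outIndex[e.first];

  // Phase 2: inclusive prefix sum, so outIndex[v] is the end offset of v's
  // edge slots (constructNodeArrays uses galois::ParallelSTL::partial_sum).
  std::partial_sum(outIndex.begin(), outIndex.end(), outIndex.begin());

  // Phase 3: scatter destinations; cursor[v] counts how many of v's slots
  // are already filled (the edgeIndexPointers role in constructEdgeArrays).
  std::vector<uint32_t> edgeDest(edges.size());
  std::vector<uint64_t> cursor(numNodes, 0);
  for (auto& e : edges) {
    uint64_t base = (e.first == 0) ? 0 : outIndex[e.first - 1];
    edgeDest[base + cursor[e.first]++] = e.second;
  }
  // Result: outIndex = {2, 3, 5, 5}, and vertex 2's edges sit in edgeDest[3..4].
}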
@@ -546,7 +533,7 @@ class ShadGraphConverter { galois::block_range(uint64_t{0}, numNodes, tid, numThreads); for (auto edgeElem : this->edges) { uint64_t srcVertex = edgeElem.first; - Vertex& vertex = this->vertices[srcVertex]; + Vertex& vertex = this->vertices[srcVertex]; if (vertex.id >= thread_work_range.first && vertex.id < thread_work_range.second) { vertex.incrNumEdges(); @@ -569,7 +556,7 @@ class ShadGraphConverter { void insertSHADVertex(const uint64_t& key, const TYPES& type, uint64_t& id) { auto found = this->vertices.find(key); if (found == this->vertices.end()) { - this->vertices[key] = Vertex(id, type, key); + this->vertices[key] = Vertex(id, type, key); this->verticeIdKeyMapping[id] = key; id++; } else { @@ -579,7 +566,7 @@ class ShadGraphConverter { /** * @brief Insert SHAD edge to a edge map. - * @detail Edges + * @detail Edges * * @param vertexKey Source vertex's SHAD token key * @param edge Adjacent edge of the vertex @@ -616,9 +603,9 @@ class ShadGraphConverter { // 3) Check if vertex information in the edges map is equal to the one // in the vertex map. assert(element.second.src_type == - this->vertices[element.second.src].type); + this->vertices[element.second.src].type); assert(element.second.dst_type == - this->vertices[element.second.dst].type); + this->vertices[element.second.dst].type); } } @@ -627,62 +614,63 @@ class ShadGraphConverter { // the number of total edges counted during inspection. uint64_t numAccumulatedEdges{0}; for (auto& element : this->vertices) { - numAccumulatedEdges += element.second.getNumEdges(); + numAccumulatedEdges += element.second.getNumEdges(); } assert(numAccumulatedEdges == numEdges); } - void VerifyCSRConstruction( - [[maybe_unused]] uint64_t* outIndexBuffer, - [[maybe_unused]] ShadNodeTy* nodeDataBuffer, - [[maybe_unused]] uint32_t* edgeDestBuffer, - [[maybe_unused]] void* edgeDataBuffer) {} + void VerifyCSRConstruction([[maybe_unused]] uint64_t* outIndexBuffer, + [[maybe_unused]] ShadNodeTy* nodeDataBuffer, + [[maybe_unused]] uint32_t* edgeDestBuffer, + [[maybe_unused]] void* edgeDataBuffer) {} template >* = nullptr> - void VerifyCSRConstruction( - uint64_t* outIndexBuffer, [[maybe_unused]] ShadNodeTy* nodeDataBuffer, - uint32_t* edgeDestBuffer, ShadEdgeTy* edgeDataBuffer) { + typename std::enable_if_t>* = nullptr> + void VerifyCSRConstruction(uint64_t* outIndexBuffer, + [[maybe_unused]] ShadNodeTy* nodeDataBuffer, + uint32_t* edgeDestBuffer, + ShadEdgeTy* edgeDataBuffer) { // 1) Iterate edge index array. - // 2) Compare each verteices' edge range with SHAD vertex + // 2) Compare each verteices' edge range with SHAD vertex for (size_t i = 0; i < this->vertices.size(); ++i) { - Vertex& srcV = this->vertices[this->verticeIdKeyMapping[i]]; + Vertex& srcV = this->vertices[this->verticeIdKeyMapping[i]]; uint64_t srcShadKey = srcV.shadKey; assert(this->verticeIdKeyMapping[i] == srcV.shadKey); - uint64_t edgeBegin = (i == 0)? 0 : outIndexBuffer[i - 1]; - uint64_t edgeEnd = outIndexBuffer[i]; + uint64_t edgeBegin = (i == 0) ? 
0 : outIndexBuffer[i - 1]; + uint64_t edgeEnd = outIndexBuffer[i]; assert(srcV.numEdges == edgeEnd - edgeBegin); assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i].type)); assert(srcV.id == i); - galois::do_all(galois::iterate(edgeBegin, edgeEnd), + galois::do_all( + galois::iterate(edgeBegin, edgeEnd), [&](size_t j) { - uint32_t dstV = edgeDestBuffer[j]; - [[maybe_unused]] uint64_t edgeData = edgeDataBuffer[j]; - - [[maybe_unused]] bool found{false}; - auto edgeRange = this->edges.equal_range(srcShadKey); - size_t cnt{0}; - for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { - Edge& edge = ei->second; - if (this->vertices[edge.dst].id == dstV) { - // Multiple edges between vertices are possible. - if (this->to_underlying(edge.type) == int(edgeData)) { - assert(this->vertices[edge.src].id == i); - assert(this->vertices[edge.src].id == srcV.id); - found = true; + uint32_t dstV = edgeDestBuffer[j]; + [[maybe_unused]] uint64_t edgeData = edgeDataBuffer[j]; + + [[maybe_unused]] bool found{false}; + auto edgeRange = this->edges.equal_range(srcShadKey); + size_t cnt{0}; + for (auto ei = edgeRange.first; ei != edgeRange.second; ++ei) { + Edge& edge = ei->second; + if (this->vertices[edge.dst].id == dstV) { + // Multiple edges between vertices are possible. + if (this->to_underlying(edge.type) == int(edgeData)) { + assert(this->vertices[edge.src].id == i); + assert(this->vertices[edge.src].id == srcV.id); + found = true; + } + } + cnt++; } - } - cnt++; - } - assert((edgeEnd - edgeBegin) == cnt); - /* - for (auto i = this->edges.begin(); i != this->edges.end(); ++i) { - std::cout << srcId << " vs " << i->first << "\n"; - } - */ - assert(found); - }, galois::steal()); + assert((edgeEnd - edgeBegin) == cnt); + /* + for (auto i = this->edges.begin(); i != this->edges.end(); ++i) { + std::cout << srcId << " vs " << i->first << "\n"; + } + */ + assert(found); + }, + galois::steal()); } } #endif @@ -691,14 +679,14 @@ class ShadGraphConverter { * @brief Cast a type to an underlying type; in case of scoped enum, * this should be an integral type. * - * @param e + * @param e */ template constexpr typename std::underlying_type::type to_underlying(E e) noexcept { - return static_cast::type>(e); + return static_cast::type>(e); } - // This holds the whole global vertices and their + // This holds the whole global vertices and their // information such as its type. A key is globla node ID, and its value // is the information. std::unordered_map vertices; @@ -715,6 +703,6 @@ class ShadGraphConverter { ShadEdgeTy* edgeDataBuffer; }; -}; // shad namespace +}; // namespace shad #endif diff --git a/libgalois/src/FileGraph.cpp b/libgalois/src/FileGraph.cpp index 420854378b..97db8c7aac 100644 --- a/libgalois/src/FileGraph.cpp +++ b/libgalois/src/FileGraph.cpp @@ -709,7 +709,7 @@ void FileGraphWriter::phase1() { graphVersion = numNodes <= std::numeric_limits::max() ? 1 : 2; size_t bytes = galois::graphs::rawBlockSize(numNodes, numEdges, sizeofEdge, - graphVersion); + graphVersion); char* mmap_base = reinterpret_cast(mmap( nullptr, bytes, PROT_READ | PROT_WRITE, _MAP_ANON | MAP_PRIVATE, -1, 0)); if (mmap_base == MAP_FAILED) diff --git a/libgalois/test/bandwidth.cpp b/libgalois/test/bandwidth.cpp index e30d8cf061..0550c20000 100644 --- a/libgalois/test/bandwidth.cpp +++ b/libgalois/test/bandwidth.cpp @@ -79,7 +79,7 @@ void run_interleaved(size_t seed, size_t mega, bool full) { auto ptr = galois::substrate::largeMallocInterleaved( size * sizeof(int), full ? 
galois::substrate::getThreadPool().getMaxThreads() - : galois::runtime::activeThreads); + : galois::runtime::activeThreads); int* block = (int*)ptr.get(); run_interleaved_helper r(block, seed, size); diff --git a/libgalois/test/move.cpp b/libgalois/test/move.cpp index 608fc4651b..5f04b7fa8e 100644 --- a/libgalois/test/move.cpp +++ b/libgalois/test/move.cpp @@ -26,17 +26,17 @@ #include "galois/substrate/PerThreadStorage.h" struct MoveOnly { - MoveOnly() = default; - MoveOnly(MoveOnly&&) = default; - MoveOnly& operator=(MoveOnly&&) = default; - MoveOnly(const MoveOnly&) = delete; + MoveOnly() = default; + MoveOnly(MoveOnly&&) = default; + MoveOnly& operator=(MoveOnly&&) = default; + MoveOnly(const MoveOnly&) = delete; MoveOnly& operator=(const MoveOnly&) = delete; }; struct MoveOnlyA { int* x; MoveOnlyA() {} - MoveOnlyA(const MoveOnlyA&) = delete; + MoveOnlyA(const MoveOnlyA&) = delete; MoveOnly& operator=(const MoveOnlyA&) = delete; ~MoveOnlyA() {} }; diff --git a/libgalois/test/reduction.cpp b/libgalois/test/reduction.cpp index ef5fc3be99..3285fcf9e8 100644 --- a/libgalois/test/reduction.cpp +++ b/libgalois/test/reduction.cpp @@ -12,11 +12,11 @@ struct Move { Move(const Move&) = delete; Move(Move&&) noexcept {} Move& operator=(const Move&) = delete; - Move& operator =(Move&&) noexcept { return *this; } + Move& operator=(Move&&) noexcept { return *this; } }; void test_move() { - auto merge_fn = [](Move& a, Move &&) -> Move& { return a; }; + auto merge_fn = [](Move& a, Move&&) -> Move& { return a; }; auto identity_fn = []() { return Move(); }; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index ec24bf2ce6..9d53b080ba 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -473,14 +473,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { void RevertHandshakeToRealGraph() { // XXX make sure I dont need anything else - masterNodes = &master_nodes_concrete_; - mirrorNodes = &(userGraph.getMirrorNodes()); + masterNodes = &master_nodes_concrete_; + mirrorNodes = &(userGraph.getMirrorNodes()); maxSharedSize = original_max_shared_size_; } - void - SetupSubgraphMirrors(std::vector>& subgraph_mirrors, - bool use_timer) { + void SetupSubgraphMirrors(std::vector>& subgraph_mirrors, + bool use_timer) { galois::StatTimer t("SubgraphMirrorSetup"); if (use_timer) { t.start(); @@ -4751,13 +4750,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { ///* // * Headers for boost serialization // */ -//#include -//#include -//#include -//#include -//#include -//#include -//#include +// #include +// #include +// #include +// #include +// #include +// #include +// #include // // public: // /** diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index 56cf8dd311..588403ad83 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1988,7 +1988,7 @@ class FieldFlags { template \ struct GNNSumAggregate_##fieldname { \ using NodeTy = NTy; \ - using ValTy = GNNFloat; \ + using ValTy = GNNFloat; \ \ static ValTy extract(uint32_t, NodeTy&) { return 0.f; } \ \ diff --git a/libgnn/README.md b/libgnn/README.md index dbca774922..2f3bf1a3aa 100644 --- a/libgnn/README.md +++ b/libgnn/README.md @@ -91,7 +91,7 @@ code has to occur before backward is called). 
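The README hunk that follows restates a standard identity worth spelling out: with softmax outputs p over the classes and a one-hot label for class c, the gradient of the cross-entropy loss with respect to the pre-softmax logits is p_i for every incorrect class i and p_c - 1 for the correct class. A minimal sketch of that backward rule, illustrative only and not the libgnn implementation:

#include <cstddef>
#include <vector>

// Gradient of cross-entropy loss with respect to the logits, given the
// softmax outputs `p` for one node and the ground-truth class `label`.
std::vector<float> SoftmaxCrossEntropyGrad(const std::vector<float>& p,
                                           std::size_t label) {
  std::vector<float> grad(p); // wrong classes: the gradient is p_i itself
  grad[label] -= 1.0f;        // correct class: p_label - 1
  return grad;
}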
Regarding the backward step: it turns out that for single class classification, the gradient if the answer is wrong is simply -the softmax value itself, and if the answer is right, then its +the softmax value itself, and if the answer is right, then its the softmax value - 1. This has the advantage of being very numerically stable as well. @@ -106,7 +106,7 @@ ReLU activation is used by the compute layers: if the value is greater than 0, it is kept, else it is discarded. Because the forward output matrix gets overwritten during -the backward step and because the derivative of the +the backward step and because the derivative of the ReLU operation requires knowledge of what elements were affected by the ReLU, the system must *track* which elements were not set to 0 using a bitmask. This @@ -151,7 +151,7 @@ the length of the vector. Actually doing this in the feature matrix is not great as it would mean that the original weight matrix needs to double in size, and additional space would have to be allocated on top of the existing input features -with the aggregated copied over to it. +with the aggregated copied over to it. Instead of doing this, you can allocate a separate weight matrix of the same size as the original, multiply the original input @@ -187,7 +187,7 @@ after aggregation, the rows of the matrix go from IR to OR. Therefore, after linear xform, IC turns to OC. After both operations, the output matrix to the next layer is the -expected OR by OC. Depending on which one occurs first, +expected OR by OC. Depending on which one occurs first, the code generates an intermediate of OR by IC *or* IC by OC. (more than one may be needed if dropout is used as that generates a new dropout matrix). @@ -370,7 +370,7 @@ The way this works is relatively simple: the code loops through each layer and calls the forward or backward pass function on it. -Depending on how the test interval is set, between each epoch +Depending on how the test interval is set, between each epoch a test subgraph may be used to check test accuracy. The flaw with the current design is that the graph object is only aware of one 'graph' at any one point, meaning the code @@ -382,7 +382,7 @@ a status that is set on nodes based on the minibatch and only includes *local seed nodes*, so keep this in mind when using it (there have been unintentional problems where I assumed `kBatch` meant more than just local seed nodes). The main reason for this is -that it helps to distinguish local and global seed nodes to avoid +that it helps to distinguish local and global seed nodes to avoid over-calculating gradients. # GNN Graph @@ -450,7 +450,7 @@ to keep things correct. In addition, the degree of a node for each sampled phase locally is kept track of. At the end of all sampling, the degrees of the nodes at each layer are synchronized among all hosts. -This is required because normalization in aggregation uses +This is required because normalization in aggregation uses the subgraph degrees (this is actually quite annoying runtime wise as it adds this extra degree sync step). @@ -465,7 +465,7 @@ the CSR; this includes edges that may not always be active. 4) Create the local subgraph features matrix by copying them over from the original feature matrix. -In order to make row elimination easier, +In order to make row elimination easier, the SID of the vertices are ordered such that seed nodes are first, the 1-hop samples next, then 2-hops, 3-hops, etc. 
This makes it easy to eliminate vertices that aren't used after @@ -559,4 +559,4 @@ Some updates will need to be made in order to do dynamic resizing of the data depending on the size of the minibatch. The best way to avoid this in general, though, is to just allocate space for the test subgraph's k-hops since that is likely to be more expensive than whatever -the minibatch size for the train nodes are (unless it's all nodes). \ No newline at end of file +the minibatch size for the train nodes are (unless it's all nodes). diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index a104f18bff..d08913caf0 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -1,5 +1,5 @@ #include "galois/GNNTypes.h" -//#include "galois/Logging.h" +// #include "galois/Logging.h" namespace galois { namespace graphs { diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 146daf24b3..db5df02223 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -55,8 +55,7 @@ class GNNGraph { GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label, bool useWMD = false) : GNNGraph(galois::default_gnn_dataset_path, dataset_name, - partition_scheme, has_single_class_label, - useWMD) {} + partition_scheme, has_single_class_label, useWMD) {} //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& input_directory, const std::string& dataset_name, @@ -72,8 +71,8 @@ class GNNGraph { std::to_string(galois::runtime::getSystemNetworkInterface().ID) + std::string("] "); // load partition - partitioned_graph_ = LoadPartition(input_directory_, dataset_name, - partition_scheme, useWMD); + partitioned_graph_ = + LoadPartition(input_directory_, dataset_name, partition_scheme, useWMD); galois::gInfo(host_prefix_, "Loading partition is completed"); // reverse edges partitioned_graph_->ConstructIncomingEdges(); @@ -918,9 +917,9 @@ class GNNGraph { } } - template < - typename T = VTy, - typename std::enable_if_t>* = nullptr> + template >* = + nullptr> void ConstructFeatureBy2HopAggregation() {} void ConstructFeatureBy2HopAggregationGPU() { @@ -934,11 +933,11 @@ class GNNGraph { void ConstructFeatureBy2HopAggregationCPU() { galois::gInfo("Construct an initial feature on CPU by " "aggregating and concatenating neighbors' features."); - //this->PrintFeatures("0hop"); - // this->FillTestNodeType(); - //this->PrintGraphTopo("before"); + // this->PrintFeatures("0hop"); + // this->FillTestNodeType(); + // this->PrintGraphTopo("before"); this->Construct1HopFeatureCPU(); - //this->PrintFeatures("1hop"); + // this->PrintFeatures("1hop"); this->Construct2HopFeatureCPU(); this->PrintFeatures("2hop"); } @@ -1009,7 +1008,6 @@ class GNNGraph { "GraphAggregateSync"); } - /// Construct feature from 2-hop neighbors. /// After `Construct1HopFeatureCPU()`, each vertex aggregates types of /// the outgoing edges and neighbors, and constructs a histogram for @@ -1089,9 +1087,9 @@ class GNNGraph { * For now, I stopped this analysis and * just enabled this method for only GCN without graph * sampling. With graph sampling, I used SAGE's graph normalization. 
- */ + */ GNNFloat GetGCNNormFactor(GraphNode lid - /*, size_t graph_user_layer_num*/) const { + /*, size_t graph_user_layer_num*/) const { #if 0 if (use_subgraph_ || use_subgraph_view_) { size_t degree; @@ -1283,7 +1281,7 @@ class GNNGraph { //! that follows SHAD GNN feature construction. This aggregates features of //! the neighbor vertices that are from (vertex's feature offset + //! 1/2 * feature length) to (vertex's feature offset + feature length), - //! to (vertex's feature offset) of the current vertex, from its proxies. + //! to (vertex's feature offset) of the current vertex, from its proxies. //! //! @param matrix_to_sync Float pointer pointing to features of the target //! vertex @@ -1296,13 +1294,13 @@ class GNNGraph { // set globals for the sync substrate if (use_timer_) { - sync_substrate_->template sync< - writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( - "SHADGraphAggregateSync"); + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("SHADGraphAggregateSync"); } else { - sync_substrate_->template sync< - writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( - "Ignore"); + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("Ignore"); } } @@ -1682,18 +1680,16 @@ class GNNGraph { // is better std::mutex label_class_set_mtx; std::unordered_set label_class_set; - galois::do_all( - galois::iterate(size_t{0}, graph.size()), - [&](size_t lid) { - local_ground_truth_labels_[lid] = graph.getData(lid).type; - label_class_set_mtx.lock(); - auto found = label_class_set.find(local_ground_truth_labels_[lid]); - if (found == label_class_set.end()) { - label_class_set.emplace(local_ground_truth_labels_[lid]); - ++num_label_classes_; - } - label_class_set_mtx.unlock(); - }); + galois::do_all(galois::iterate(size_t{0}, graph.size()), [&](size_t lid) { + local_ground_truth_labels_[lid] = graph.getData(lid).type; + label_class_set_mtx.lock(); + auto found = label_class_set.find(local_ground_truth_labels_[lid]); + if (found == label_class_set.end()) { + label_class_set.emplace(local_ground_truth_labels_[lid]); + ++num_label_classes_; + } + label_class_set_mtx.unlock(); + }); // Exchange found local vertex classes with other hosts to // calculate the total number of the classes. @@ -1703,9 +1699,11 @@ class GNNGraph { // support std::set and std::unordered_set de/serialization. // TODO(hc): support this type of serialization. std::vector label_vec(label_class_set.begin(), label_class_set.end()); - auto &net = galois::runtime::getSystemNetworkInterface(); + auto& net = galois::runtime::getSystemNetworkInterface(); for (uint32_t h = 0; h < net.Num; ++h) { - if (h == net.ID) { continue; } + if (h == net.ID) { + continue; + } galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, label_vec); net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); @@ -1719,25 +1717,24 @@ class GNNGraph { std::vector h_label_vec; galois::runtime::gDeserialize(p->second, h_label_vec); - galois::do_all(galois::iterate(h_label_vec), - [&](int i) { - label_class_set_mtx.lock(); - auto found = label_class_set.find(i); - if (found == label_class_set.end()) { - label_class_set.emplace(i); - // Increaes the number of classes only if - // it was not found in the local host. 
- ++num_label_classes_; - } - label_class_set_mtx.unlock(); - } ); + galois::do_all(galois::iterate(h_label_vec), [&](int i) { + label_class_set_mtx.lock(); + auto found = label_class_set.find(i); + if (found == label_class_set.end()) { + label_class_set.emplace(i); + // Increaes the number of classes only if + // it was not found in the local host. + ++num_label_classes_; + } + label_class_set_mtx.unlock(); + }); } increment_evilPhase(); } - template < - typename T = VTy, - typename std::enable_if_t>* = nullptr> + template >* = + nullptr> void ConstructLocalLabels() {} void ReadLocalLabelsBin(const std::string& dataset_name) { @@ -2024,7 +2021,7 @@ class GNNGraph { return other_accum.reduce(); } - //! @brief Choose and set local training/validation/testing vertices + //! @brief Choose and set local training/validation/testing vertices //! consecutively. void SetLocalMasksConsecutively() { // allocate the memory for the local masks @@ -2033,20 +2030,19 @@ class GNNGraph { local_validation_mask_.resize(partitioned_graph_->size()); local_testing_mask_.resize(partitioned_graph_->size()); - global_training_count_ = partitioned_graph_->globalSize() / 4; - size_t global_testing_count = global_training_count_ / 2; - global_training_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; - global_testing_mask_range_ = { - .begin = global_training_count_, - .end = global_training_count_ + global_testing_count, - .size = global_testing_count - }; + global_training_count_ = partitioned_graph_->globalSize() / 4; + size_t global_testing_count = global_training_count_ / 2; + global_training_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; + global_testing_mask_range_ = {.begin = global_training_count_, + .end = global_training_count_ + + global_testing_count, + .size = global_testing_count}; global_validation_mask_range_ = { .begin = global_training_count_ + global_testing_count, - .end = global_training_count_ + 2 * global_testing_count, - .size = global_testing_count - }; + .end = global_training_count_ + 2 * global_testing_count, + .size = global_testing_count}; // training for (size_t i = global_training_mask_range_.begin; i < global_training_mask_range_.end; i++) { @@ -2076,8 +2072,8 @@ class GNNGraph { //! @brief Randomly choose and set local training/validation/testing //! vertices. This mimics what AGILE GNN does through Pytorch //! `DistributedRandomSampler`. - void DistributedRandomSampling( - size_t local_sample_size, std::vector* masks) { + void DistributedRandomSampling(size_t local_sample_size, + std::vector* masks) { // Pytorch's DistributedRandomSampler, // first materializes an array populated with // 0 to (num_local_vertices - 1), shuffles this array, and @@ -2088,15 +2084,16 @@ class GNNGraph { // the current host, but also others, and mark vertices to // the corresponding mask array if they are locals. auto& net = galois::runtime::getSystemNetworkInterface(); - std::vector< - std::pair> num_masters_per_hosts(net.Num); - std::pair master_ranges = - { partitioned_graph_->getGID(0), - partitioned_graph_->getGID(partitioned_graph_->numMasters() - 1) }; + std::vector> num_masters_per_hosts(net.Num); + std::pair master_ranges = { + partitioned_graph_->getGID(0), + partitioned_graph_->getGID(partitioned_graph_->numMasters() - 1)}; // 1) Exchange node master ranges, and so, each host knows // the range of vertex sampling. 
for (uint32_t h = 0; h < net.Num; ++h) { - if (h == net.ID) { continue; } + if (h == net.ID) { + continue; + } galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, master_ranges); net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); @@ -2108,34 +2105,32 @@ class GNNGraph { p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); - galois::runtime::gDeserialize(p->second, - num_masters_per_hosts[p->first]); + galois::runtime::gDeserialize(p->second, num_masters_per_hosts[p->first]); } increment_evilPhase(); // 2) Sample vertices and mark them to the `masks` array // if a vertex is local. for (uint32_t h = 0; h < net.Num; ++h) { - size_t h_begin = (h == net.ID)? master_ranges.first : num_masters_per_hosts[h].first; - size_t h_end = (h == net.ID)? master_ranges.second : num_masters_per_hosts[h].second; + size_t h_begin = + (h == net.ID) ? master_ranges.first : num_masters_per_hosts[h].first; + size_t h_end = (h == net.ID) ? master_ranges.second + : num_masters_per_hosts[h].second; std::vector h_all_indices(h_end - h_begin); // Fill global vertex ids to h_global_ids. galois::do_all(galois::iterate(h_begin, h_end), - [&](size_t i) { - h_all_indices[i - h_begin] = i; - } ); + [&](size_t i) { h_all_indices[i - h_begin] = i; }); std::mt19937 rand(0); std::shuffle(h_all_indices.begin(), h_all_indices.end(), rand); galois::do_all( - galois::iterate(size_t{0}, local_sample_size), - [&](size_t i) { + galois::iterate(size_t{0}, local_sample_size), [&](size_t i) { // First, it doens't have duplications. // Second, only mark `masks` if the checking vertex is a local // vertex. if (partitioned_graph_->isLocal(h_all_indices[i])) { (*masks)[partitioned_graph_->getLID(h_all_indices[i])] = 1; } - } ); + }); } } @@ -2146,26 +2141,28 @@ class GNNGraph { local_validation_mask_.resize(partitioned_graph_->size()); local_testing_mask_.resize(partitioned_graph_->size()); - auto& net = galois::runtime::getSystemNetworkInterface(); - global_training_count_ = partitioned_graph_->globalSize() / 4; + auto& net = galois::runtime::getSystemNetworkInterface(); + global_training_count_ = partitioned_graph_->globalSize() / 4; size_t global_testing_count = global_training_count_ / 2; - size_t num_local_training_samples = global_training_count_ / net.Num; - size_t num_local_testing_samples = global_testing_count / net.Num; + size_t num_local_training_samples = global_training_count_ / net.Num; + size_t num_local_testing_samples = global_testing_count / net.Num; size_t num_local_validating_samples = num_local_testing_samples; - global_training_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; - global_testing_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; - global_validation_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_training_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; + global_testing_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; + global_validation_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; incomplete_masks_ = true; - DistributedRandomSampling( - num_local_training_samples, &local_training_mask_); - DistributedRandomSampling( - num_local_testing_samples, &local_testing_mask_); - DistributedRandomSampling( - num_local_validating_samples, &local_validation_mask_); + 
DistributedRandomSampling(num_local_training_samples, + &local_training_mask_); + DistributedRandomSampling(num_local_testing_samples, &local_testing_mask_); + DistributedRandomSampling(num_local_validating_samples, + &local_validation_mask_); } //! Read masks of local nodes only for training, validation, and testing @@ -2533,7 +2530,7 @@ class GNNGraph { ++galois::runtime::evilPhase; if (galois::runtime::evilPhase >= static_cast(std::numeric_limits::max())) { - galois::runtime::evilPhase = 1; + galois::runtime::evilPhase = 1; } } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 6929eb70a2..73153a44de 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -10,8 +10,8 @@ #include "galois/layers/GNNLayer.cuh" #endif -//#define PRINT_VEC_LOG_ -//#define PRINT_GPU_VEC_ +// #define PRINT_VEC_LOG_ +// #define PRINT_GPU_VEC_ namespace galois { diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index be882647a1..3931ed06e1 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -268,8 +268,7 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows * this->layer_dimensions_.output_columns); // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative( - input_gradient->data(), p_in_temp_1_.data()); + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -383,9 +382,8 @@ class GraphConvolutionalLayer : public GNNLayer { galois::substrate::PerThreadStorage>*, bool is_backward) { galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = (is_backward)? - this->layer_dimensions_.input_rows : - this->layer_dimensions_.output_rows; + size_t num_nodes = (is_backward) ? this->layer_dimensions_.input_rows + : this->layer_dimensions_.output_rows; size_t last_master = *(this->graph_.end_owned()); assert(0 == *(this->graph_.begin_owned())); @@ -414,9 +412,8 @@ class GraphConvolutionalLayer : public GNNLayer { if (!this->config_.disable_normalization) { if (this->graph_.IsSubgraphOn() || this->graph_.IsSubgraphViewOn()) { - source_norm = - this->graph_.GetDegreeNorm( - src, this->graph_user_layer_number_); + source_norm = this->graph_.GetDegreeNorm( + src, this->graph_user_layer_number_); } else { source_norm = this->graph_.GetGCNNormFactor(src); } @@ -424,8 +421,7 @@ class GraphConvolutionalLayer : public GNNLayer { // init to self if (!this->config_.disable_self_aggregate) { - graphs::bitset_graph_aggregate.set( - this->graph_.ConvertToLID(src)); + graphs::bitset_graph_aggregate.set(this->graph_.ConvertToLID(src)); // only aggregate self once on master if (src < last_master) { for (size_t i = 0; i < column_length; i++) { @@ -437,42 +433,40 @@ class GraphConvolutionalLayer : public GNNLayer { } // loop through all destinations to grab the feature to aggregate - auto e_beg = (is_backward)? - this->graph_.in_edge_begin(src) : this->graph_.edge_begin(src); - auto e_end = (is_backward)? - this->graph_.in_edge_end(src) : this->graph_.edge_end(src); + auto e_beg = (is_backward) ? this->graph_.in_edge_begin(src) + : this->graph_.edge_begin(src); + auto e_end = (is_backward) ? 
this->graph_.in_edge_end(src) + : this->graph_.edge_end(src); for (auto e = e_beg; e != e_end; e++) { if (this->layer_phase_ == GNNPhase::kTrain || this->layer_phase_ == GNNPhase::kBatch) { if (this->IsSampledLayer()) { - bool is_sampled = (is_backward)? - this->graph_.IsInEdgeSampled( - e, this->graph_user_layer_number_) : - this->graph_.IsEdgeSampled( - e, this->graph_user_layer_number_); + bool is_sampled = (is_backward) + ? this->graph_.IsInEdgeSampled( + e, this->graph_user_layer_number_) + : this->graph_.IsEdgeSampled( + e, this->graph_user_layer_number_); // ignore non-sampled nodes and edges if (!is_sampled) { continue; } } } - size_t dst = (is_backward)? - this->graph_.GetInEdgeDest(e) : this->graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set( - this->graph_.ConvertToLID(src)); + size_t dst = (is_backward) ? this->graph_.GetInEdgeDest(e) + : this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set(this->graph_.ConvertToLID(src)); size_t index_to_dst_feature = dst * column_length; if (!this->config_.disable_normalization) { GNNFloat norm_scale; if (this->graph_.IsSubgraphOn() || this->graph_.IsSubgraphViewOn()) { - norm_scale = (is_backward)? - this->graph_.GetDegreeNorm( - dst, this->graph_user_layer_number_) - : source_norm; + norm_scale = (is_backward) + ? this->graph_.GetDegreeNorm( + dst, this->graph_user_layer_number_) + : source_norm; } else { - norm_scale = - source_norm * this->graph_.GetGCNNormFactor(dst); + norm_scale = source_norm * this->graph_.GetGCNNormFactor(dst); } galois::VectorMulAdd( @@ -492,8 +486,8 @@ class GraphConvolutionalLayer : public GNNLayer { galois::loopname("ConvolutionalAggregateAll")); // aggregate sync aggregate_all_sync_timer.start(); - this->graph_.AggregateSync(aggregate_output, column_length, - is_backward, num_nodes); + this->graph_.AggregateSync(aggregate_output, column_length, is_backward, + num_nodes); aggregate_all_sync_timer.stop(); } @@ -534,7 +528,7 @@ class GraphConvolutionalLayer : public GNNLayer { } else { #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, - is_backward); + is_backward); #ifdef GALOIS_ENABLE_GPU } #endif @@ -560,8 +554,7 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows, this->layer_dimensions_.input_columns, this->layer_dimensions_.output_columns, - node_embeddings, this->layer_weights_.data(), - output); + node_embeddings, this->layer_weights_.data(), output); #ifdef GALOIS_ENABLE_GPU } #endif @@ -569,8 +562,7 @@ class GraphConvolutionalLayer : public GNNLayer { } //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative( - const GNNFloat* gradients, GNNFloat* output) { + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("BackwardXform", kRegionName); timer.start(); diff --git a/libgnn/include/galois/layers/ReLULayer.h b/libgnn/include/galois/layers/ReLULayer.h index 879c462330..c35704a28e 100644 --- a/libgnn/include/galois/layers/ReLULayer.h +++ b/libgnn/include/galois/layers/ReLULayer.h @@ -10,25 +10,25 @@ namespace galois { -//! ReLU layer: takes each row of the input matrix and sets 0 to elements < 0 in a row. -//! Currently this only works with **single class* labels and is coded as such. +//! ReLU layer: takes each row of the input matrix and sets 0 to elements < 0 in +//! a row. Currently this only works with **single class* labels and is coded as +//! such. 
template class ReLULayer : public GNNLayer { public: - ReLULayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, + ReLULayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : ReLULayer( layer_num, graph, backward_output_matrix, dimensions, - GNNLayerConfig{.allocate_weights = false, .disable_output = true}) - {} + GNNLayerConfig{.allocate_weights = false, .disable_output = true}) { + } ReLULayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config) : GNNLayer(layer_num, graph, - backward_output_matrix, dimensions, config) { + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config) { this->layer_type_ = galois::GNNLayerType::kReLU; GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); GALOIS_LOG_VERBOSE("ReLU initialized"); @@ -79,9 +79,9 @@ class ReLULayer : public GNNLayer { return this->forward_output_matrix_; } - PointerWithSize BackwardPhaseCPU( - PointerWithSize prev_layer_input, - PointerWithSize* input_gradients) { + PointerWithSize + BackwardPhaseCPU(PointerWithSize prev_layer_input, + PointerWithSize* input_gradients) { galois::StatTimer Timer("ReLUBackward", "ReLULayer"); this->TimerStart(&Timer); @@ -102,8 +102,8 @@ class ReLULayer : public GNNLayer { for (size_t row_index = row_offset; row_index < (row_offset + feature_length); row_index++) { this->p_backward_output_matrix_[row_index] = - (prev_layer_input[row_index] > 0? 1 : 0) * - (*input_gradients)[row_index]; + (prev_layer_input[row_index] > 0 ? 1 : 0) * + (*input_gradients)[row_index]; } } }, diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 19d5a75815..5bcaf66589 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -585,11 +585,11 @@ class SAGELayer : public GNNLayer { static const constexpr char* kRegionName = "SAGELayer"; //! 
CPU aggregation - void AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*, - bool is_backward) { + void + AggregateAllCPU(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>*, + bool is_backward) { // aggregation causes a row count change size_t num_rows_to_handle; if (!is_backward) { diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index c25f3ae7ec..582fba95f6 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -57,8 +57,8 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, constexpr size_t vectorization_length = 16; const size_t aligned_end = length - length % vectorization_length; __m512 scale_vec_main = _mm512_set_ps( - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); for (size_t i = 0; i < aligned_end; i += vectorization_length) { _mm512_storeu_ps( &output[i], diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index 8b13789179..e69de29bb2 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -1 +0,0 @@ - diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 8b13789179..e69de29bb2 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -1 +0,0 @@ - diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index d005ddd6bc..75cb516b7a 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -94,7 +94,7 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) - + add_executable(sigmoidlayer-test sigmoidlayer-test.cpp) target_link_libraries(sigmoidlayer-test galois_gnn) add_test(NAME sigmoidlayer-test COMMAND sigmoidlayer-test) @@ -102,19 +102,19 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) - + add_executable(gnnfb-test gnnfb-test.cpp) target_link_libraries(gnnfb-test galois_gnn) add_test(NAME gnnfb-test COMMAND gnnfb-test) - + add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) add_test(NAME adam-test COMMAND adam-test) - + add_executable(accuracy-test accuracy-test.cpp) target_link_libraries(accuracy-test galois_gnn) add_test(NAME accuracy-test COMMAND accuracy-test) - + add_executable(epoch-test epoch-test.cpp) target_link_libraries(epoch-test galois_gnn) add_test(NAME epoch-test COMMAND epoch-test) diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp index 6229c9288c..df3dfe915e 100644 --- a/libgnn/test/back-conv-test.cpp +++ b/libgnn/test/back-conv-test.cpp @@ -71,8 +71,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = 
layer_1->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 1bec3b4b31..6170e87d50 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -61,8 +61,8 @@ int main() { // create the layer, no norm factor std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, - dimension_0, dcon); + std::make_unique>( + 0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const galois::PointerWithSize layer_0_forward_output = @@ -126,8 +126,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -203,8 +203,8 @@ int main() { // don't have time for at the moment // TODO in future maybe add better unit test for this std::unique_ptr> layer_2 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, config); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, config); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); diff --git a/libgnn/test/gcn-sample-edge-test.cpp b/libgnn/test/gcn-sample-edge-test.cpp index 8bb4e74f9a..c612639d10 100644 --- a/libgnn/test/gcn-sample-edge-test.cpp +++ b/libgnn/test/gcn-sample-edge-test.cpp @@ -37,7 +37,7 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; - dcon.disable_normalization = false; + dcon.disable_normalization = false; dcon.DebugConfig(); // Choose a few sample nodes test_graph.SetSampledNode(0); @@ -52,7 +52,7 @@ int main() { test_graph.SampleAllEdges(0, false, 1); // After the above lines, nodes 0, 1, 3, 4, 5 and - // edges 0, 7, 8 should be sampled. + // edges 0, 7, 8 should be sampled. 
// So, // 0 -> 1, 2 <- 3 -> 4 GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(0)); @@ -64,9 +64,7 @@ int main() { GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(7)); GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(8)); - - galois::DynamicBitSet& bset = - test_graph.GetDefinitelySampledNodesBset(); + galois::DynamicBitSet& bset = test_graph.GetDefinitelySampledNodesBset(); bset.ParallelReset(); bset.set(0); bset.set(1); @@ -77,8 +75,8 @@ int main() { test_graph.EnableSubgraph(); galois::GNNLayerDimensions dimension_0; - dimension_0.input_rows = 5; - dimension_0.input_columns = 3; + dimension_0.input_rows = 5; + dimension_0.input_columns = 3; dimension_0.output_columns = 2; // Layer declaration @@ -115,8 +113,7 @@ int main() { dummy_ones_v[5] = 0; galois::PointerWithSize layer_1_backward_output = - layer_1->BackwardPhase( - test_graph.GetLocalFeatures(), &dummy_ones); + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); GALOIS_LOG_ASSERT(layer_1_backward_output[0] == 0); GALOIS_LOG_ASSERT(layer_1_backward_output[1] == 0); @@ -136,7 +133,7 @@ int main() { galois::PointerWithSize layer_1_weight_gradients = layer_1->GetLayerWeightGradients(); - + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 6); GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 6); GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 6); diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index da0e6bd3f9..aa1513ca91 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -28,8 +28,8 @@ int main() { std::vector adam_sizes = {12, 28}; auto adam = std::make_unique(adam_sizes, 2); - galois::GraphNeuralNetwork - gnn(std::move(test_graph), std::move(adam), std::move(gnn_config)); + galois::GraphNeuralNetwork gnn( + std::move(test_graph), std::move(adam), std::move(gnn_config)); // note this does not include output layer GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index e4451a4900..b8a05fc8cc 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -16,11 +16,11 @@ int main() { // note multi level reading tested in another test GALOIS_LOG_VERBOSE("reddit with single label, oec"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC, - true, false); + galois::graphs::GNNGraph( + "cora", galois::graphs::GNNPartitionScheme::kOEC, true, false); GALOIS_LOG_VERBOSE("reddit with single label, cvc"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, - true, false); + galois::graphs::GNNGraph( + "cora", galois::graphs::GNNPartitionScheme::kCVC, true, false); // below for when I want to check the remapper // galois::graphs::GNNGraph remapper("ogbn-papers100M", diff --git a/libgnn/test/gpu-back-conv-test.cpp b/libgnn/test/gpu-back-conv-test.cpp index 2df78d694d..7fedffeda6 100644 --- a/libgnn/test/gpu-back-conv-test.cpp +++ b/libgnn/test/gpu-back-conv-test.cpp @@ -54,8 +54,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); galois::PointerWithSize dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index a36740b5e3..dc5a4ad917 100644 --- 
a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -53,8 +53,8 @@ int main() { // create the layer, no norm factor std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, - dimension_0, dcon); + std::make_unique>( + 0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner layer_0->ForwardPhase(test_graph.GetLocalFeatures()); @@ -113,8 +113,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); const std::vector& layer_1_forward_output = @@ -195,8 +195,8 @@ int main() { // don't have time for at the moment // TODO in future maybe add better unit test for this std::unique_ptr> layer_2 = - std::make_unique>(2, test_graph, &p_back, - dimension_0, config); + std::make_unique>( + 2, test_graph, &p_back, dimension_0, config); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); // pointer is to GPU memory: copy it over to a CPU source for verification const std::vector& l2_fo = diff --git a/libgnn/test/gpu-sage-layer-test.cpp b/libgnn/test/gpu-sage-layer-test.cpp index 7af3808c85..bbe5cc97cb 100644 --- a/libgnn/test/gpu-sage-layer-test.cpp +++ b/libgnn/test/gpu-sage-layer-test.cpp @@ -47,8 +47,8 @@ int main() { scon.disable_concat = false; std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, dimension_0, - dcon, scon); + std::make_unique>(0, test_graph, &p_null, + dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self layer_0->InitSelfWeightsTo1(); @@ -121,8 +121,8 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon, scon); + auto layer_1 = std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -217,8 +217,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique>(2, test_graph, &p_back, - dimension_0, config, scon); + auto layer_2 = std::make_unique>( + 2, test_graph, &p_back, dimension_0, config, scon); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); const std::vector& l2_fo = layer_2->CopyForwardOutputFromGPU(); diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp index 6d6b30942e..d2b659f238 100644 --- a/libgnn/test/l2norm-layer-test.cpp +++ b/libgnn/test/l2norm-layer-test.cpp @@ -38,8 +38,8 @@ int main() { std::vector back_matrix(14); galois::PointerWithSize p_back(back_matrix); - auto l2_layer = std::make_unique>(2, test_graph, &p_back, - dimension_0); + auto l2_layer = std::make_unique>( + 2, test_graph, &p_back, dimension_0); galois::PointerWithSize normed = l2_layer->ForwardPhase(l2_input); diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index 73b3a08893..a2e68fa9df 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -82,7 +82,7 @@ int main(int argc, char* argv[]) { // dimensions from test case size_t a_dim = 12000000; - //size_t a_dim = 120000; + // size_t a_dim = 
120000; size_t b_dim = 128; size_t c_dim = 16; @@ -90,7 +90,7 @@ int main(int argc, char* argv[]) { std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output - //std::vector matrix_3(a_dim * c_dim); + // std::vector matrix_3(a_dim * c_dim); std::vector matrix_3(b_dim * c_dim); size_t kBigSize = 1000000000; @@ -126,16 +126,19 @@ int main(int argc, char* argv[]) { auto start = std::chrono::high_resolution_clock::now(); // transpose because it's the same as the problematic call in GNN // TODO(loc) non transpose version - //CBlasSGEMM(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + // CBlasSGEMM(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, + // matrix_1.data(), // matrix_2.data(), matrix_3.data()); CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); - //CBlasSGEMM(CblasNoTrans, CblasTrans, b_dim, a_dim, c_dim, matrix_1.data(), - // matrix_2.data(), matrix_3.data()); + // CBlasSGEMM(CblasNoTrans, CblasTrans, b_dim, a_dim, c_dim, + // matrix_1.data(), + // matrix_2.data(), matrix_3.data()); auto stop = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::time_point_cast(stop) - - std::chrono::time_point_cast(start); + auto duration = + std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); printf("Run duration is %lf ms\n", duration.count() / 1000.0); } diff --git a/libgnn/test/sage-layer-test.cpp b/libgnn/test/sage-layer-test.cpp index 8551126d37..3f53921795 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -33,8 +33,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, dimension_0, - dcon, scon); + std::make_unique>(0, test_graph, &p_null, + dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self layer_0->InitSelfWeightsTo1(); @@ -113,8 +113,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon, scon); + auto layer_1 = std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -205,8 +205,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique>(1, test_graph, &p_back, - dimension_0, config, scon); + auto layer_2 = std::make_unique>( + 1, test_graph, &p_back, dimension_0, config, scon); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 0bda9d81a8..d875a72ee4 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -60,8 +60,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->EnableSampling(); diff --git a/libgnn/test/single_mkl_micro.cpp b/libgnn/test/single_mkl_micro.cpp index 7111b1b057..97035bdfba 100644 --- a/libgnn/test/single_mkl_micro.cpp +++ b/libgnn/test/single_mkl_micro.cpp @@ -20,25 +20,26 @@ // MKL wrapper #ifdef USE_OMP void CBlasSGEMMOMP(const CBLAS_TRANSPOSE trans_a, 
const CBLAS_TRANSPOSE trans_b, - size_t input_rows, size_t input_columns, size_t output_columns, - const float* a, const float* b, float* output) { + size_t input_rows, size_t input_columns, + size_t output_columns, const float* a, const float* b, + float* output) { // set lead dimension based on cblas spec w.r.t. transpose setting size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CblasNoTrans) ? output_columns : input_columns; - #pragma omp parallel for +#pragma omp parallel for for (int i = 0; i < omp_get_num_threads(); i++) { unsigned chunk_size = input_rows / omp_get_num_threads(); - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); if (omp_get_num_threads() - 1 == i) { my_end = input_rows; } unsigned rows_to_use = my_end - my_start; const float* my_a = a + (my_start * input_columns); - float* my_output = output + (my_start * output_columns); + float* my_output = output + (my_start * output_columns); // do the MM cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, @@ -49,9 +50,10 @@ void CBlasSGEMMOMP(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, #endif #if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) -void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, - size_t input_rows, size_t input_columns, size_t output_columns, - const float* a, const float* b, float* output) { +void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, + const CBLAS_TRANSPOSE trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { // set lead dimension based on cblas spec w.r.t. transpose setting size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; size_t lead_dim_b = @@ -62,46 +64,44 @@ void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans temps.resize(galois::getActiveThreads()); } - galois::on_each( - [&] (size_t i, size_t num_threads) { - if (trans_a != CblasTrans) { - unsigned chunk_size = input_rows / num_threads; - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); - if (num_threads - 1 == i) { - my_end = input_rows; - } - unsigned rows_to_use = my_end - my_start; - - const float* my_a = a + (my_start * input_columns); - float* my_output = output + (my_start * output_columns); - - // do the MM - cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, - input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, - false ? 1.0 : 0.0, my_output, output_columns); - } else { - galois::PODResizeableArray& my_pod = temps[i]; - my_pod.resize(input_rows * output_columns); - - unsigned chunk_size = input_columns / num_threads; - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); - if (num_threads - 1 == i) { - my_end = input_columns; - } - unsigned b_rows_to_use = my_end - my_start; - - const float* my_a = a + (my_start * input_rows); - const float* my_b = b + (my_start * output_columns); - - // do the MM - cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, - b_rows_to_use, 1.0, my_a, lead_dim_a, my_b, lead_dim_b, - false ? 
1.0 : 0.0, my_pod.data(), output_columns); + galois::on_each([&](size_t i, size_t num_threads) { + if (trans_a != CblasTrans) { + unsigned chunk_size = input_rows / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_rows; } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, my_output, output_columns); + } else { + galois::PODResizeableArray& my_pod = temps[i]; + my_pod.resize(input_rows * output_columns); + + unsigned chunk_size = input_columns / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_columns; + } + unsigned b_rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_rows); + const float* my_b = b + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + b_rows_to_use, 1.0, my_a, lead_dim_a, my_b, lead_dim_b, + false ? 1.0 : 0.0, my_pod.data(), output_columns); } - ); + }); if (trans_a == CblasTrans) { printf("Manual summation\n"); @@ -114,7 +114,6 @@ void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans } #endif - void CacheFlush(std::vector* matrix) { for (size_t i = 0; i < matrix->size(); i++) { (*matrix)[i] = i; @@ -155,7 +154,7 @@ int main(int argc, char* argv[]) { std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output - //std::vector matrix_3(a_dim * c_dim); + // std::vector matrix_3(a_dim * c_dim); std::vector matrix_3(b_dim * c_dim); size_t kBigSize = 1000000000; @@ -184,21 +183,24 @@ int main(int argc, char* argv[]) { // transpose because it's the same as the problematic call in GNN // TODO(loc) non transpose version #ifdef USE_OMP - CBlasSGEMMOMP(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), - matrix_2.data(), matrix_3.data()); + CBlasSGEMMOMP(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, + matrix_1.data(), matrix_2.data(), matrix_3.data()); #endif #if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) - //CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), - // matrix_2.data(), matrix_3.data()); - CBlasSGEMMGalois(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), - matrix_2.data(), matrix_3.data()); + // CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, + // matrix_1.data(), + // matrix_2.data(), matrix_3.data()); + CBlasSGEMMGalois(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, + matrix_1.data(), matrix_2.data(), matrix_3.data()); #endif - //CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), - // matrix_2.data(), matrix_3.data()); + // CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, + // matrix_1.data(), + // matrix_2.data(), matrix_3.data()); auto stop = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::time_point_cast(stop) - - std::chrono::time_point_cast(start); + auto duration = + std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); printf("Run duration is %lf ms\n", duration.count() / 1000.0); } From ce1a079890e8182946d2914a3e453864b93db709 Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" 
<133701794+hochanlee-amd@users.noreply.github.com> Date: Mon, 23 Oct 2023 22:05:52 -0500 Subject: [PATCH 610/660] Make linking MKL libraries in a portable way This commit removes the absolute path to link the MKL library from CMakeLists and source codes, but uses Intel's MKL CMake configuration file and achieves portable linking. --- CMakeLists.txt | 4 +- libgnn/CMakeLists.txt | 33 +++-- libgnn/README.md | 18 +++ libgnn/test/CMakeLists.txt | 249 ++++++++++++++----------------------- 4 files changed, 126 insertions(+), 178 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 88eaa64d74..4731b8b99d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,9 +141,7 @@ endif() # TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file ################################################################################ if(USE_MKL_BLAS) - SET(MKL_ROOT /home/hochan/intel/oneapi/mkl/2023.1.0) - find_package(MKL REQUIRED) - message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") + find_package(MKL CONFIG REQUIRED PATH $ENV{MKL_ROOT}) if (MKL_FOUND) else() message(WARNING "MKL not found") diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ca799c34b4..030e5bb516 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -6,34 +6,29 @@ set(sources src/graphs/GNNGraph.cpp ) -## TODO(hc): Note that these libraries should be hard-coded -## based on your own system. -## These should be automatic library linking. -set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) -set(INTEL_COMPILER_LIBRARIES /home/hochan/intel/oneapi/compiler/2023.1.0/linux/compiler/lib/intel64_lin) -set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") -set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") - add_library(galois_gnn STATIC ${sources}) -target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) -target_link_libraries(galois_gnn ${INTEL_LIBS}) -target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) -target_link_libraries(galois_gnn galois_shmem) -target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) +target_compile_options(galois_gnn PUBLIC + $) +target_include_directories(galois_gnn PUBLIC + $) +target_link_libraries(galois_gnn PUBLIC $) +target_link_libraries(galois_gnn PUBLIC galois_shmem) +target_link_libraries(galois_gnn PUBLIC galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include - ${MKL_INCLUDE_DIRS} ) add_library(galois_gnn_single STATIC ${sources}) -target_link_directories(galois_gnn_single PUBLIC ${MKL_LIBRARIES}) -target_link_libraries(galois_gnn_single galois_shmem) -target_link_libraries(galois_gnn_single ${SINGLE_INTEL_LIBS}) -target_link_libraries(galois_gnn_single galois_dist_async galois_cusp galois_gluon galois_support) +target_compile_options(galois_gnn_single PUBLIC + $) +target_include_directories(galois_gnn_single PUBLIC + $) +target_link_libraries(galois_gnn_single PUBLIC $) +target_link_libraries(galois_gnn_single PUBLIC galois_shmem) +target_link_libraries(galois_gnn_single PUBLIC galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn_single PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include - ${MKL_INCLUDE_DIRS} ) set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) diff --git a/libgnn/README.md b/libgnn/README.md index 2f3bf1a3aa..ded103c9b9 100644 --- a/libgnn/README.md +++ b/libgnn/README.md @@ -560,3 +560,21 @@ data depending on the size of the 
minibatch. The best way to avoid this in general, though, is to just allocate space for the test subgraph's k-hops since that is likely to be more expensive than whatever the minibatch size for the train nodes are (unless it's all nodes). + +Author: Hochan Lee, + +# Intel Open API MKL + +Galois-GNN requires Intel Math Kernel Library (MKL), and so, you are required to +install Intel oneAPI. This toolkit contains all the necessary tools and libraries including +the MKL library. We recommend to get Intel oneAPI >= 2023.1.0. from the Intel official website +(https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html) +as this is what we have used and tested. + +Once you followed their instruction and installed Intel oneAPI, +you should export the MKL path in the installation path to your environment +before you cmake and install Galois-GNN. CMakeLists.txt will look for the MKL root path. + +```Shell +export MKL_ROOT=[THE PARENT PATH OF THE INSTALLATION PATH]/intel/openapi/mkl/2023.1.0 +``` diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 75cb516b7a..40efcfa0e3 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -1,75 +1,5 @@ find_package(OpenMP) -add_executable(mkl_micro mkl_micro.cpp) -target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) -target_link_directories(mkl_micro PUBLIC ${INTEL_COMPILER_LIBRARIES}) -target_include_directories(mkl_micro PUBLIC - ${MKL_INCLUDE_DIRS} -) -target_link_libraries(mkl_micro ${INTEL_LIBS}) - -add_executable(mkl_micro_omp mkl_micro.cpp) -target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) -target_link_directories(mkl_micro_omp PUBLIC ${INTEL_COMPILER_LIBRARIES}) -target_include_directories(mkl_micro_omp PUBLIC - ${MKL_INCLUDE_DIRS} -) -target_link_libraries(mkl_micro_omp PUBLIC ${INTEL_LIBS} OpenMP::OpenMP_CXX) -target_compile_definitions(mkl_micro_omp PUBLIC USE_OMP=1) - -add_executable(mkl_micro_sgalois mkl_micro.cpp) -target_link_libraries(mkl_micro_sgalois galois_gnn) -target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) - -add_executable(mkl_micro_dgalois mkl_micro.cpp) -target_link_libraries(mkl_micro_dgalois galois_gnn) -target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) - -add_executable(remapverify remapverify.cpp) -target_link_libraries(remapverify galois_gnn) -target_compile_definitions(remapverify PUBLIC USE_DIST_GALOIS=1) - -add_executable(mkl_micro_delete_galois mkl_micro.cpp) -target_link_libraries(mkl_micro_delete_galois galois_gnn) -target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) - -################################################################################ - -#add_executable(single_mkl_micro single_mkl_micro.cpp) -#target_link_directories(single_mkl_micro PUBLIC ${MKL_LIBRARIES}) -#target_include_directories(single_mkl_micro PUBLIC -# ${MKL_INCLUDE_DIRS} -#) -#target_link_libraries(single_mkl_micro ${SINGLE_INTEL_LIBS}) - -add_executable(single_mkl_micro_omp single_mkl_micro.cpp) -target_link_directories(single_mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) -target_include_directories(single_mkl_micro_omp PUBLIC - ${MKL_INCLUDE_DIRS} -) -target_link_libraries(single_mkl_micro_omp ${SINGLE_INTEL_LIBS} OpenMP::OpenMP_CXX) -target_compile_definitions(single_mkl_micro_omp PUBLIC USE_OMP=1) - -add_executable(single_mkl_micro_sgalois single_mkl_micro.cpp) -target_link_libraries(single_mkl_micro_sgalois galois_gnn_single) -target_compile_definitions(single_mkl_micro_sgalois 
PUBLIC USE_SHARED_GALOIS=1) - -add_executable(single_mkl_micro_dgalois single_mkl_micro.cpp) -target_link_libraries(single_mkl_micro_dgalois galois_gnn_single) -target_compile_definitions(single_mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) - -################################################################################ - -add_executable(gstl_test gstl_test.cpp) -target_link_libraries(gstl_test galois_shmem) - -################################################################################ - -add_executable(gnngraph-test gnngraph-test.cpp) -target_link_libraries(gnngraph-test galois_gnn) -add_test(NAME gnngraph-test COMMAND gnngraph-test) - -# multihost testing things set(hosts) set(host 12) while (${host} GREATER 1) @@ -78,105 +8,50 @@ while (${host} GREATER 1) endwhile() list(APPEND hosts "1") -if (NOT GALOIS_ENABLE_GPU) - add_executable(convlayer-test convlayer-test.cpp) - target_link_libraries(convlayer-test galois_gnn) - add_test(NAME convlayer-test COMMAND convlayer-test) - - add_executable(sage-layer-test sage-layer-test.cpp) - target_link_libraries(sage-layer-test galois_gnn) - add_test(NAME sage-layer-test COMMAND sage-layer-test) - - add_executable(l2norm-layer-test l2norm-layer-test.cpp) - target_link_libraries(l2norm-layer-test galois_gnn) - add_test(NAME l2norm-layer-test COMMAND l2norm-layer-test) - - add_executable(softmaxlayer-test softmaxlayer-test.cpp) - target_link_libraries(softmaxlayer-test galois_gnn) - add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) - - add_executable(sigmoidlayer-test sigmoidlayer-test.cpp) - target_link_libraries(sigmoidlayer-test galois_gnn) - add_test(NAME sigmoidlayer-test COMMAND sigmoidlayer-test) - - add_executable(gnnconstruct-test gnnconstruct-test.cpp) - target_link_libraries(gnnconstruct-test galois_gnn) - add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) - - add_executable(gnnfb-test gnnfb-test.cpp) - target_link_libraries(gnnfb-test galois_gnn) - add_test(NAME gnnfb-test COMMAND gnnfb-test) - - add_executable(adam-test adam-test.cpp) - target_link_libraries(adam-test galois_gnn) - add_test(NAME adam-test COMMAND adam-test) - - add_executable(accuracy-test accuracy-test.cpp) - target_link_libraries(accuracy-test galois_gnn) - add_test(NAME accuracy-test COMMAND accuracy-test) - - add_executable(epoch-test epoch-test.cpp) - target_link_libraries(epoch-test galois_gnn) - add_test(NAME epoch-test COMMAND epoch-test) - - add_executable(multilabel-epoch-test multilabel-epoch-test.cpp) - target_link_libraries(multilabel-epoch-test galois_gnn) - add_test(NAME multilabel-epoch-test COMMAND multilabel-epoch-test) +add_executable(gnngraph-test gnngraph-test.cpp) +target_link_libraries(gnngraph-test galois_gnn) +add_test(NAME gnngraph-test COMMAND gnngraph-test) +if (NOT GALOIS_ENABLE_GPU) + set(GALOIS_TESTS + ${GALOIS_TESTS} + convlayer-test + sage-layer-test + l2norm-layer-test + softmaxlayer-test + sigmoidlayer-test + gnnconstruct-test + gnnfb-test + adam-test + accuracy-test + epoch-test + multilabel-epoch-test + multilabel-read + f1-test + sample-bit-test + gcn-sample-edge-test + ) add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) foreach(host_count ${hosts}) add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./aggregate-sync-test) set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() - add_executable(back-conv-test back-conv-test.cpp) 
target_link_libraries(back-conv-test galois_gnn) foreach(host_count ${hosts}) add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./back-conv-test) set_tests_properties(run-back-conv-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() - - add_executable(multilabel-read multilabel-read.cpp) - target_link_libraries(multilabel-read galois_gnn) - add_test(NAME multilabel-read COMMAND multilabel-read) - - add_executable(f1-test f1-test.cpp) - target_link_libraries(f1-test galois_gnn) - add_test(NAME f1-test COMMAND f1-test) - - #add_executable(sample-test sample-test.cpp) - #target_link_libraries(sample-test galois_gnn) - #add_test(NAME sample-test COMMAND sample-test) - - add_executable(sample-bit-test sample-bit-test.cpp) - target_link_libraries(sample-bit-test galois_gnn) - add_test(NAME sample-bit-test COMMAND sample-bit-test) - - add_executable(gcn-sample-edge-test gcn-sample-edge-test.cpp) - target_link_libraries(gcn-sample-edge-test galois_gnn) - add_test(NAME gcn-sample-edge-test COMMAND gcn-sample-edge-test) else() - add_executable(gpu-sage-layer-test gpu-sage-layer-test.cpp) - target_link_libraries(gpu-sage-layer-test galois_gnn) - add_test(NAME gpu-sage-layer-test COMMAND gpu-sage-layer-test) - - add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) - target_link_libraries(gpu-convlayer-test galois_gnn) - add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) - - add_executable(gpu-softmaxlayer-test gpu-softmaxlayer-test.cpp) - target_link_libraries(gpu-softmaxlayer-test galois_gnn) - add_test(NAME gpu-softmaxlayer-test COMMAND gpu-softmaxlayer-test) - - add_executable(gpu-adam-test gpu-adam-test.cpp) - target_link_libraries(gpu-adam-test galois_gnn) - add_test(NAME gpu-adam-test COMMAND gpu-adam-test) - - add_executable(gpu-epoch-test gpu-epoch-test.cpp) - target_link_libraries(gpu-epoch-test galois_gnn) - #add_test(NAME gpu-epoch-test COMMAND gpu-epoch-test) - + set(GALOIS_TESTS + ${GALOIS_TESTS} + gpu-sage-layer-test + gpu-convlayer-test + gpu-softmaxlayer-test + gpu-adam-test + gpu-epoch-test + ) add_executable(gpu-aggregate-sync-test gpu-aggregate-sync-test.cpp) target_link_libraries(gpu-aggregate-sync-test galois_gnn) @@ -202,4 +77,66 @@ else() endforeach() endif() -# TODO multi host tests? 
+message("Galois Tests..") +foreach(galois_test ${GALOIS_TESTS}) + add_executable(${galois_test} ${galois_test}.cpp) + target_link_libraries(${galois_test} galois_gnn) + add_test(NAME ${galois_test} COMMAND ${galois_test}) +endforeach() + +add_executable(remapverify remapverify.cpp) +target_link_libraries(remapverify galois_gnn) +target_compile_definitions(remapverify PUBLIC USE_DIST_GALOIS=1) + +# MKL Test +set(MKL_TESTS + mkl_micro_sgalois + mkl_micro_dgalois + mkl_micro_delete_galois + single_mkl_micro_sgalois + single_mkl_micro_dgalois + mkl_micro + mkl_micro_omp + single_mkl_micro_omp +) + +add_executable(mkl_micro_sgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_sgalois PUBLIC galois_gnn) +target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + +add_executable(mkl_micro_dgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_dgalois PUBLIC galois_gnn) +target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +add_executable(mkl_micro_delete_galois mkl_micro.cpp) +target_link_libraries(mkl_micro_delete_galois PUBLIC galois_gnn) +target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) + +add_executable(single_mkl_micro_sgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_sgalois PUBLIC galois_gnn_single) +target_compile_definitions(single_mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + +add_executable(single_mkl_micro_dgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_dgalois PUBLIC galois_gnn_single) +target_compile_definitions(single_mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +add_executable(mkl_micro mkl_micro.cpp) + +add_executable(mkl_micro_omp mkl_micro.cpp) +target_link_libraries(mkl_micro_omp PUBLIC ${INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(mkl_micro_omp PUBLIC USE_OMP=1) + +add_executable(single_mkl_micro_omp single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_omp PUBLIC ${SINGLE_INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(single_mkl_micro_omp PUBLIC USE_OMP=1) + +foreach(mkl_test ${MKL_TESTS}) + target_compile_options(${mkl_test} PUBLIC + $) + target_include_directories(${mkl_test} PUBLIC + $) + target_link_libraries(${mkl_test} PUBLIC $) +endforeach() + +add_executable(gstl_test gstl_test.cpp) +target_link_libraries(gstl_test galois_shmem) From 80cf95269c34be52146784627e7287ac779f7c59 Mon Sep 17 00:00:00 2001 From: patrickkenney9801 Date: Tue, 24 Oct 2023 18:29:29 -0500 Subject: [PATCH 611/660] feat: Add Dockerfile to repo --- .gitignore | 1 + CONTRIBUTING.md | 6 +++++ Dockerfile | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ Makefile | 21 ++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 Dockerfile diff --git a/.gitignore b/.gitignore index 94fc673c6e..8f0aff5b96 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ tags # no build files /build* +/dockerbuild* # no python build artifacts *.pyc diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2297468d67..007227dc70 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,6 +6,12 @@ tools like `clang-format` manually. Code should be clear and documented where needed. +## Setup + +Users can run `make docker-image` to setup all dependecies needed for +`pando-galois`. After creating the image it can be run via `make docker`. +And for first time cmake users can run `make run-cmake`. 
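For example, a first-time setup could look like the sequence below (a sketch based on the Makefile targets added in this patch; `cmake --build` is used as a generic build step, and `BUILD_DIR` is exported inside the container by the Dockerfile):

```Shell
# Build the development image with your user baked in, then enter it.
make docker-image
make docker

# Inside the container: configure the build tree, then compile.
make run-cmake
cmake --build ${BUILD_DIR}
```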
+ ## Tools ### [asdf](https://asdf-vm.com) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..d49b9d3211 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,66 @@ +ARG BUILD_IMAGE=ubuntu:22.04 +FROM --platform=linux/amd64 ${BUILD_IMAGE} AS build + +WORKDIR /tmp + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && \ + apt install -y \ + cmake \ + gcc \ + g++ \ + build-essential \ + make \ + libboost-all-dev \ + libfmt-dev \ + libzstd-dev \ + lsb-release \ + wget \ + software-properties-common \ + gnupg \ + gdb \ + vim \ + git \ + python3 \ + python3-pip \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# setup intel repo for intel-basekit +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \ + gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null +RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \ + tee /etc/apt/sources.list.d/oneAPI.list +RUN apt update && \ + apt install -y \ + intel-basekit \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" + +ARG SRC_DIR=/pando-galois +ARG BUILD_DIR=/pando-galois/dockerbuild +ARG UNAME +ARG UID +ARG GID + +RUN if [ "${UNAME}" != "root" ] ; then groupadd -g ${GID} ${UNAME} \ + && useradd -ms /bin/bash -u "${UID}" -g "${GID}" ${UNAME} ; fi + +RUN mkdir -p /home/${UNAME} \ + && chown ${UNAME}:${UNAME} /home/${UNAME} + +USER ${UNAME} +WORKDIR /home/${UNAME} +ENV BUILD_DIR=${BUILD_DIR} + +RUN pip3 install compdb pre-commit cpplint "clang-format>=12.0.1" + +RUN echo "PATH=/home/${UNAME}/.local/bin/:\$PATH" >> /home/${UNAME}/.zshenv + +RUN echo "export SRC_DIR=${SRC_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "export BUILD_DIR=${BUILD_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "export OMPI_ALLOW_RUN_AS_ROOT=1" >> /home/${UNAME}/.bashrc +RUN echo "export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1" >> /home/${UNAME}/.bashrc +RUN echo "export MKL_ROOT=/opt/intel/oneapi/mkl/2023.2.0" >> /home/${UNAME}/.bashrc + +WORKDIR ${SRC_DIR} diff --git a/Makefile b/Makefile index 2457b3c0a1..df77923812 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,9 @@ +SHELL := /bin/bash + +IMAGE_NAME := pando-galois +VERSION := 0.0.1 +CONTAINER_SRC_DIR := /pando-galois + dependencies: dependencies-asdf dependencies-asdf: @@ -18,3 +24,18 @@ hooks: pre-commit: @pre-commit run -a + +docker-image: + @docker --context default build --build-arg VERSION=${VERSION} \ + --build-arg UNAME=$(shell whoami) \ + --build-arg UID=$(shell id -u) \ + --build-arg GID=$(shell id -g) \ + -t ${IMAGE_NAME}:${VERSION} \ + --file Dockerfile \ + --target build . + +docker: + @docker --context default run --rm -v $(shell pwd)/:${CONTAINER_SRC_DIR} --privileged --workdir=${CONTAINER_SRC_DIR} -it ${IMAGE_NAME}:${VERSION} bash -l + +run-cmake: + @cmake -S . -B ${BUILD_DIR} -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_MKL_BLAS=ON -DGALOIS_ENABLE_DIST=ON From 5fadc59352102f606737ecc69795e42812f61c5b Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:30:50 -0500 Subject: [PATCH 612/660] Fix for GCN minibatching (#19) GCN didn't correctly check minibatch subgraphs, and this commit fixes that. 
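Part of the fix is how epoch accuracy is reported when training with minibatches: instead of keeping only the last batch's accuracy, per-batch (correct, checked) counts are accumulated and divided once at the end of the epoch. A minimal standalone sketch of that accumulation pattern (illustrative names only, not the library API):

```cpp
#include <utility>
#include <vector>

// Each minibatch reports (number of correct predictions, number of nodes checked),
// mirroring what GetGlobalAccuracyCheckResult() returns in the diff below.
using BatchResult = std::pair<float, float>;

// Epoch accuracy is the ratio of the accumulated counts, not the mean of per-batch ratios.
float EpochAccuracy(const std::vector<BatchResult>& batches) {
  float correct = 0.0f, total_checked = 0.0f;
  for (const auto& b : batches) {
    correct += b.first;
    total_checked += b.second;
  }
  return total_checked > 0.0f ? correct / total_checked : 0.0f;
}
```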
--- libgnn/include/galois/GraphNeuralNetwork.h | 29 +++++-- libgnn/include/galois/graphs/GNNGraph.h | 83 ++++++++++++++----- .../galois/layers/GraphConvolutionalLayer.h | 48 +++++++---- libgnn/include/galois/layers/SoftmaxLayer.h | 5 +- 4 files changed, 120 insertions(+), 45 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 88a48f961c..1d9108fbe5 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -500,8 +500,10 @@ class GraphNeuralNetwork { galois::StatTimer epoch_timer("TrainingTime", kRegionName); galois::StatTimer validation_timer("ValidationTime", kRegionName); galois::StatTimer epoch_test_timer("TestTime", kRegionName); - + float total_checked{0}, correct{0}; for (size_t epoch = 0; epoch < num_epochs; epoch++) { + total_checked = 0; + correct = 0; epoch_timer.start(); // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { @@ -684,16 +686,30 @@ class GraphNeuralNetwork { mb_timer.stop(); const PointerWithSize batch_pred = DoInference(); - train_accuracy = GetGlobalAccuracy(batch_pred); + + if (graph_->is_using_wmd()) { + std::pair accuracy_results = + this->graph_->GetGlobalAccuracyCheckResult( + batch_pred, phase_, config_.do_sampling()); + train_accuracy = accuracy_results.first / accuracy_results.second; + correct += accuracy_results.first; + total_checked += accuracy_results.second; + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": The number of correct answers is ", correct, "/", + total_checked, "\n"); + } else { + train_accuracy = GetGlobalAccuracy(batch_pred); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Train accuracy/F1 micro is ", train_accuracy, + " time ", batch_timer.get(), "\n"); + } + GradientPropagation(); work_left_ += graph_->MoreTrainMinibatches(); char global_work_left = work_left_.reduce(); batch_timer.stop(); epoch_timer.stop(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Train accuracy/F1 micro is ", train_accuracy, - " time ", batch_timer.get(), "\n"); bool test_eval = config_.minibatch_test_interval_ @@ -760,6 +776,9 @@ class GraphNeuralNetwork { if (this_host == 0) { const std::string t_name_acc = "TrainEpoch" + std::to_string(epoch) + "Accuracy"; + if (config_.train_minibatch_size() && this->graph_->is_using_wmd()) { + train_accuracy = correct / total_checked; + } galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); galois::runtime::reportStat_Single(kRegionName, t_name_acc, diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index db5df02223..31b1fbf120 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -53,15 +53,15 @@ class GNNGraph { // galois::LargeArray>>; GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label, bool useWMD = false) + bool has_single_class_label, bool use_wmd = false) : GNNGraph(galois::default_gnn_dataset_path, dataset_name, - partition_scheme, has_single_class_label, useWMD) {} + partition_scheme, has_single_class_label, use_wmd) {} //! Loads a graph and all relevant metadata (labels, features, masks, etc.) 
GNNGraph(const std::string& input_directory, const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label, - bool useWMD = false) - : input_directory_(input_directory) { + bool use_wmd = false) + : input_directory_(input_directory), use_wmd_(use_wmd) { GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, dataset_name); // save host id @@ -72,7 +72,7 @@ class GNNGraph { std::string("] "); // load partition partitioned_graph_ = - LoadPartition(input_directory_, dataset_name, partition_scheme, useWMD); + LoadPartition(input_directory_, dataset_name, partition_scheme); galois::gInfo(host_prefix_, "Loading partition is completed"); // reverse edges partitioned_graph_->ConstructIncomingEdges(); @@ -93,7 +93,7 @@ class GNNGraph { bitset_graph_aggregate.resize(partitioned_graph_->size()); // Construct/read additional graph data - if (useWMD) { + if (use_wmd) { galois::gInfo("Feature is constructed by aggregating 2-hop features, " "instead from feature files"); this->ConstructFeatureBy2HopAggregation(); @@ -939,7 +939,7 @@ class GNNGraph { this->Construct1HopFeatureCPU(); // this->PrintFeatures("1hop"); this->Construct2HopFeatureCPU(); - this->PrintFeatures("2hop"); + //this->PrintFeatures("2hop"); } void PrintFeatures(std::string postfix) { @@ -1178,6 +1178,16 @@ class GNNGraph { return GetGlobalAccuracyCPU(predictions, phase, sampling); } + /** + * @brief Compare predictions from a model and ground truths, and return the + * results. + */ + std::pair + GetGlobalAccuracyCheckResult(PointerWithSize predictions, + GNNPhase phase, bool sampling) { + return GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + } + std::pair GetBatchAccuracy(PointerWithSize predictions) { // check owned nodes' accuracy @@ -1622,6 +1632,9 @@ class GNNGraph { return definitely_sampled_nodes_; } + /* @brief Return true if this is constructed from a WMD graph otherwise false. */ + bool is_using_wmd() { return this->use_wmd_; } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -1632,12 +1645,13 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! 
Partitions a particular dataset given some partitioning scheme - std::unique_ptr LoadPartition( - const std::string& input_directory, const std::string& dataset_name, - galois::graphs::GNNPartitionScheme partition_scheme, bool useWMD) { + std::unique_ptr + LoadPartition(const std::string& input_directory, + const std::string& dataset_name, + galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path std::string input_file = input_directory + dataset_name + ".csgr"; - if (useWMD) { + if (this->use_wmd_) { input_file = dataset_name; } GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); @@ -1646,16 +1660,16 @@ class GNNGraph { switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", - false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, this->use_wmd_, true, + "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", - false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, this->use_wmd_, true, + "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", - false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, this->use_wmd_, true, + "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; @@ -1680,6 +1694,7 @@ class GNNGraph { // is better std::mutex label_class_set_mtx; std::unordered_set label_class_set; + num_label_classes_ = 0; galois::do_all(galois::iterate(size_t{0}, graph.size()), [&](size_t lid) { local_ground_truth_labels_[lid] = graph.getData(lid).type; label_class_set_mtx.lock(); @@ -2362,7 +2377,9 @@ class GNNGraph { float accuracy{0}; if (is_single_class_label()) { global_accuracy_for_singleclass_timer.start(); - accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + auto accuracy_result = + GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + accuracy = accuracy_result.first / accuracy_result.second; global_accuracy_for_singleclass_timer.stop(); } else { global_accuracy_for_multiclass_timer.start(); @@ -2373,12 +2390,30 @@ class GNNGraph { return accuracy; } - float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, - GNNPhase phase, bool) { + std::pair + GetGlobalAccuracyCPUSingle(PointerWithSize predictions, + GNNPhase phase, bool) { // check owned nodes' accuracy num_correct_.reset(); total_checked_.reset(); +#if 0 + std::cout << "single accuracy print:\n"; + for (int i = *begin_owned(); i < *end_owned(); ++i) { + if (!IsValidForPhase(i, GNNPhase::kBatch)) { + continue; + } + //std::cout << subgraph_->SIDToLID(i) << ", " << galois::MaxIndex(num_label_classes_, &predictions[i * num_label_classes_]) << + std::cout << "accuracy:" << subgraph_->SIDToLID(i) << ", " << + predictions[i * num_label_classes_] << ", " << + predictions[i * num_label_classes_ + 1] << ", " << + predictions[i * num_label_classes_ + 2] << ", " << + predictions[i * num_label_classes_ + 3] << ", " << + predictions[i * num_label_classes_ + 4] << "-> " << + galois::MaxIndex(num_label_classes_, &predictions[i * num_label_classes_]) << + " vs " << GetSingleClassLabel(i) << "\n"; + } +#endif galois::do_all( // will only loop over sampled nodes if sampling is on galois::iterate(begin_owned(), end_owned()), @@ -2408,9 +2443,8 @@ 
class GNNGraph { GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, global_checked); - - return static_cast(global_correct) / - static_cast(global_checked); + return std::make_pair(static_cast(global_correct), + static_cast(global_checked)); } float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, @@ -2646,6 +2680,9 @@ class GNNGraph { std::vector node_remapping_; + // True if a WMD graph is being used otherwise false + bool use_wmd_{false}; + ////////////////////////////////////////////////////////////////////////////// // GPU things ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 3931ed06e1..d5259a7af9 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -139,7 +139,8 @@ class GraphConvolutionalLayer : public GNNLayer { GNNFloat* agg_data; // first, dropout if (!this->config_.disable_dropout && - (this->layer_phase_ == GNNPhase::kTrain)) { + (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch)) { this->DoDropout(input_embeddings, &p_in_temp_1_); input_data = p_in_temp_1_.data(); agg_data = p_in_temp_2_.data(); @@ -187,7 +188,8 @@ class GraphConvolutionalLayer : public GNNLayer { kRegionName); timer.start(); - assert(this->layer_phase_ == GNNPhase::kTrain); + assert(this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch); // derivative of activation if (!this->config_.disable_activation) { @@ -285,12 +287,11 @@ class GraphConvolutionalLayer : public GNNLayer { input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); - // done after above because input_data = p_backward_output_matrix in some - // cases; use first before overwriting here if layer # doesn't = 0, it - // means I can mess with the input data itself instad of masking the - // gradients I can mask the input if (this->layer_number_ != 0) { if (this->graph_.IsSubgraphOn()) { + // Gradients for mirror nodes should be updated by their owner + // hosts. In case of graph sampling, we should let this know whether + // a node is a sampled master or not. this->MaskInputNonMasters(&input_data, this->layer_dimensions_.input_rows, this->graph_.GetNonLayerZeroMasters()); @@ -299,6 +300,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows); } } else { + // The first layer can zerofy non-master nodes' gradients since + // it is the last gradient aggregation. // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients if (this->graph_.IsSubgraphOn()) { @@ -320,6 +323,12 @@ class GraphConvolutionalLayer : public GNNLayer { } else { #endif weight_gradient_timer.start(); + // p_out_temp aggregated gradients from the next layer. + // The weight gradients for this layer is calculated by + // (The current vertex embedding x p_out_temp). + // Vertex embedding dimension is (input row x input column), + // p_out_temp dimension is (input row x output column), + // and weight is (input column x output column). 
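// Illustration with hypothetical sizes: with 1024 sampled input rows, 128 input
// columns, and 16 output columns, the call below forms the (128 x 16) weight
// gradient as X^T (128 x 1024) times p_out_temp (1024 x 16).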
galois::CBlasSGEMM( CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, this->layer_dimensions_.input_rows, @@ -382,13 +391,18 @@ class GraphConvolutionalLayer : public GNNLayer { galois::substrate::PerThreadStorage>*, bool is_backward) { galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = (is_backward) ? this->layer_dimensions_.input_rows - : this->layer_dimensions_.output_rows; + size_t num_nodes = (is_backward) + ? this->layer_dimensions_.input_rows + // In case of minibatching or graph sampling, + // the outut row must be the samped graph's number of + // nodes of that layer. + : this->layer_dimensions_.output_rows; size_t last_master = *(this->graph_.end_owned()); assert(0 == *(this->graph_.begin_owned())); galois::do_all( + /* Either an original or a sampled graph iterator is used */ galois::iterate(*(this->graph_.begin()), num_nodes), [&](size_t src) { size_t index_to_src_feature = src * column_length; @@ -397,12 +411,13 @@ class GraphConvolutionalLayer : public GNNLayer { aggregate_output[index_to_src_feature + i] = 0; } - if (this->layer_phase_ == GNNPhase::kTrain) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { if (this->IsSampledLayer()) { // Check if node is part of sampled graph; ignore after // 0'ing if it is not sampled. // TODO(hc): check if SAGE also checks this - if (!this->graph_.IsInSampledGraph(src)) { + if (!this->graph_.IsInSampledGraphSubgraph(src)) { return; } } @@ -550,11 +565,14 @@ class GraphConvolutionalLayer : public GNNLayer { } else { #endif // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, - this->layer_dimensions_.input_rows, - this->layer_dimensions_.input_columns, - this->layer_dimensions_.output_columns, - node_embeddings, this->layer_weights_.data(), output); + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, + this->layer_dimensions_.input_rows /* Graph or sampled graph nodes */, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings /* input row x input columns */, + this->layer_weights_.data() /* input column x output column */, + output); #ifdef GALOIS_ENABLE_GPU } #endif diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index b55e37f05d..0fe3d66284 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -69,6 +69,7 @@ class SoftmaxLayer : public GNNLayer { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], &this->p_backward_output_matrix_[feature_length * i]); + // create ground truth vector for this LID std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); @@ -97,7 +98,6 @@ class SoftmaxLayer : public GNNLayer { galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); #endif - this->TimerStop(&Timer); return this->p_backward_output_matrix_; } @@ -127,7 +127,8 @@ class SoftmaxLayer : public GNNLayer { galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), [&](const unsigned node) { if (this->IsSampledLayer()) { - if (this->layer_phase_ == GNNPhase::kTrain && + if ((this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) && !this->graph_.IsInSampledGraphSubgraph(node)) return; } From 5f7f4d28ada758e5919b332810b4e37e3736b8fb Mon Sep 17 00:00:00 2001 From: Ian Henriksen Date: Fri, 21 Aug 2020 15:17:23 -0500 Subject: [PATCH 613/660] Use relaxed consistency in 
atomic helpers. Remove unused Galois atomic header. Fix broken barrier in DistStats Shamelessly stolen from KatanaGraph. Fixes an issue where host 0 can stay in termination detection because it detects work after other hosts have decided that they are done with termination detection. kway fixed comment out stdout coarsening fix time sep output imbalance Fix 0-initialization of elements in a multiple_sum structure on GPU Update Refine.cpp self edge check multi edge support for tc Update README.md Small change to help compilation Added an LC_CSR graph with 64 bit node indexes Changes to support a common interface Added partial log-structure for LS_CSR with out of line objects Not ready for consumption Changes to dev Small fix so pangolin compiles Fix Catch so it compiles easily on Linux Parallel stuff for LS_LC_CSR_64_Graph Some changes for different insertions types Experimental changes for Distributed Graph Made a passable transpose function that should work for our purposes Added parallel constructor Update LS_LC_CSR_64 to be usable with distributed traingle counting feat: add setter for OffilineGraph size feat: a flag to set size on addEdgesUnSort Initial commit of instrumentation for Yineng Added a print method fix: typo feat: profile each host seperatly feat: write profile to seperate file fix: typo fix: initalize EdgeEnd on construction Added sane defaults for initilization feat: add OEC policy and make OfflineGraph virtual chore: Port over wmd graph from graph-log-sketch revert: Disable transferring node data due to serialization issues chore: clean up code Stack Sizes Made BFS graph500 compliant and added stack_capture Added StackTracer to CMakeLists Added documentation for Stack Capture Fixed CMakeLists.txt for Stack_Capture Prefix sums for thread ranges Added documentation to PrefixSum and WaterFallLock feat: make wmd a lib and add test Another small compilation bug, needed to return the right object A few changes in order to make liblonestar compile with PrefixSum Old bugfix got borked feat: support loop ranges for LS_LC_CSR_64 chore: remove unused profile code fix: resolve compile error and warning fix: resolve unused arg fix: resolve minor compile warning feat: add active threads arg to wmd graph test fix: add steal to do all in WMDGraph feat: remove unnecessary computation feat: support file striping in read graph file feat: support multithreading file reading fix: remove slow memory op fix: a bug about getline and '\n' fix: incorrect loop condition fix: remove slow reserve() Adding new Graph Part policy with less metadata memory error fix: change int typing and restyle fixed bug + improved perf feat: add testing script feat: add glbid field to vertex struct of wmd feat: print full graph in test script bug fix style: address some style issues and add comments Small fixes for compilation issues Another small compilation bug, needed to return the right object A few changes in order to make liblonestar compile with PrefixSum Old fix that got borked somehow Remove constants that didn't seemed to creep back up during merging fix: typo and TODO chores: add TODO for numa ds restyle: change the way to deal with unused var chores: add comment for perthread storage and test changes for PR bug fix chore: Run clang-format on the repo and add git hooks from gnn branch chore: Add instrument.h header file to libwmd feat: Add extensible import for multiple files and projection chore: Add instrumentation for memory accesses to import/projection bugfix: Add Divija's fix for setting edge 
destinations to local ids trying to commit fix: remove uncessary code feat: add instrument to wmd graph importer fixed all bugs (hopefully) test: update instrument feat: add a scrip to process instrument result --- .gitignore | 5 + CMakeLists.txt | 41 +- CONTRIBUTING.md | 45 + Makefile | 6 + README.md | 43 +- libcusp/include/galois/graphs/BasePolicies.h | 25 +- .../include/galois/graphs/DistributedGraph.h | 22 +- .../galois/graphs/GenericPartitioners.h | 13 +- libcusp/include/galois/graphs/NewGeneric.h | 12 + libdist/include/galois/runtime/Serialize.h | 33 + libdist/src/DistStats.cpp | 14 +- libgalois/include/galois/Atomic.h | 284 --- libgalois/include/galois/AtomicHelpers.h | 26 +- libgalois/include/galois/PrefixSum.h | 196 ++ libgalois/include/galois/WaterFallLock.h | 77 + .../include/galois/graphs/GraphHelpers.h | 38 +- .../include/galois/graphs/LC_CSR_64_Graph.h | 1027 ++++++++++ .../include/galois/graphs/LC_CSR_Graph.h | 4 +- .../galois/graphs/LS_LC_CSR_64_Graph.h | 1647 ++++++++++++++++ .../include/galois/graphs/LS_LC_CSR_Graph.h | 1113 +++++++++++ libgalois/include/galois/graphs/MorphGraph.h | 49 +- .../include/galois/graphs/OfflineGraph.h | 12 +- .../include/galois/runtime/StackTracer.h | 219 +++ .../galois/substrate/PerThreadStorage.h | 4 + libgalois/src/HWTopoLinux.cpp | 1 + libgalois/test/CMakeLists.txt | 2 + libgalois/test/prefixsum.cpp | 101 + libgalois/test/wfl.cpp | 106 + libgpu/include/internal.h | 4 +- libwmd/CMakeLists.txt | 31 + libwmd/include/galois/wmd/WMDGraph.h | 1720 +++++++++++++++++ libwmd/include/galois/wmd/WMDPartitioner.h | 931 +++++++++ libwmd/include/galois/wmd/data_types.h | 741 +++++++ libwmd/include/galois/wmd/graph.h | 190 ++ libwmd/include/galois/wmd/graphTypes.h | 75 + libwmd/include/galois/wmd/instrument.h | 190 ++ libwmd/include/galois/wmd/schema.h | 177 ++ libwmd/test/CMakeLists.txt | 12 + libwmd/test/wmd-graph-build.cpp | 125 ++ lonestar/analytics/cpu/bipart/Coarsening.cpp | 248 ++- lonestar/analytics/cpu/bipart/Refine.cpp | 50 +- lonestar/analytics/cpu/bipart/bipart.cpp | 316 +-- lonestar/analytics/cpu/bipart/bipart.h | 22 +- .../cpu/triangle-counting/Triangles.cpp | 14 +- .../analytics/distributed/bfs/bfs_push.cpp | 116 +- .../scientific/cpu/longestedge/test/catch.hpp | 1 + scripts/generate_wmdpartitioner_statstics.py | 56 + tools/graph-convert/graph-convert.cpp | 1 + 48 files changed, 9570 insertions(+), 615 deletions(-) delete mode 100644 libgalois/include/galois/Atomic.h create mode 100644 libgalois/include/galois/PrefixSum.h create mode 100644 libgalois/include/galois/WaterFallLock.h create mode 100644 libgalois/include/galois/graphs/LC_CSR_64_Graph.h create mode 100644 libgalois/include/galois/graphs/LS_LC_CSR_64_Graph.h create mode 100644 libgalois/include/galois/graphs/LS_LC_CSR_Graph.h create mode 100644 libgalois/include/galois/runtime/StackTracer.h create mode 100644 libgalois/test/prefixsum.cpp create mode 100644 libgalois/test/wfl.cpp create mode 100644 libwmd/CMakeLists.txt create mode 100644 libwmd/include/galois/wmd/WMDGraph.h create mode 100644 libwmd/include/galois/wmd/WMDPartitioner.h create mode 100644 libwmd/include/galois/wmd/data_types.h create mode 100644 libwmd/include/galois/wmd/graph.h create mode 100644 libwmd/include/galois/wmd/graphTypes.h create mode 100644 libwmd/include/galois/wmd/instrument.h create mode 100644 libwmd/include/galois/wmd/schema.h create mode 100644 libwmd/test/CMakeLists.txt create mode 100644 libwmd/test/wmd-graph-build.cpp create mode 100755 scripts/generate_wmdpartitioner_statstics.py diff --git 
a/.gitignore b/.gitignore index 8f0aff5b96..3a054d27a7 100644 --- a/.gitignore +++ b/.gitignore @@ -23,11 +23,16 @@ tags .ycm_extra_conf.py # no build files +<<<<<<< HEAD /build* /dockerbuild* +======= +/*build* +>>>>>>> 8e396b028 (Fixed CMakeLists.txt for Stack_Capture) # no python build artifacts *.pyc /python/galois.egg-info /python/galois/*.so /_skbuild + diff --git a/CMakeLists.txt b/CMakeLists.txt index 4731b8b99d..146f4adb25 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,19 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") include(GNUInstallDirs) - +<<<<<<< HEAD +<<<<<<< HEAD +======= +if(STACK_CAPTURE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}") +endif(STACK_CAPTURE) +>>>>>>> deb11e279 (Added StackTracer to CMakeLists) + +======= +>>>>>>> 8e396b028 (Fixed CMakeLists.txt for Stack_Capture) file(STRINGS config/version.txt GALOIS_VERSION) string(REGEX REPLACE "[ \t\n]" "" GALOIS_VERSION ${GALOIS_VERSION}) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1" GALOIS_VERSION_MAJOR ${GALOIS_VERSION}) @@ -23,9 +35,10 @@ endif() ###### Options (alternatively pass as options to cmake -DName=Value) ###### ###### Distributed-heterogeneous features ###### -set(GALOIS_ENABLE_DIST OFF CACHE BOOL "Enable distributed features") +set(GALOIS_ENABLE_DIST ON CACHE BOOL "Enable distributed features") set(GALOIS_CUDA_CAPABILITY "" CACHE STRING "Semi-colon list of CUDA compute capability version numbers to enable GPU features") # e.g., "3.7;6.1" set(GALOIS_COMM_STATS OFF CACHE BOOL "Report more detailed statistics of communication") +set(GALOIS_ENABLE_WMD ON CACHE BOOL "Enable WMD dataset support") ###### General features ###### set(GALOIS_ENABLE_PAPI OFF CACHE BOOL "Use PAPI counters for profiling") set(GALOIS_ENABLE_VTUNE OFF CACHE BOOL "Use VTune for profiling") @@ -51,6 +64,7 @@ set(GALOIS_NUM_TEST_GPUS "0" CACHE STRING "Number of test GPUs to use (on a sing set(GALOIS_USE_LCI OFF CACHE BOOL "Use LCI network runtime instead of MPI") set(GALOIS_USE_BARE_MPI OFF CACHE BOOL "Use MPI directly (no dedicated network-runtime thread)") set(GALOIS_NUM_TEST_THREADS "" CACHE STRING "Maximum number of threads to use when running tests (default: number of physical cores)") +set(GALOIS_ENABLE_INSTRUMENT OFF CACHE BOOL "Enable generating instrument in the runtime") if(NOT GALOIS_NUM_TEST_THREADS) cmake_host_system_information(RESULT GALOIS_NUM_TEST_THREADS QUERY NUMBER_OF_PHYSICAL_CORES) @@ -175,6 +189,10 @@ if(GALOIS_ENABLE_PAPI) add_definitions(-DGALOIS_ENABLE_PAPI) endif() +if (GALOIS_ENABLE_INSTRUMENT) + add_definitions(-DGALOIS_INSTRUMENT) +endif() + find_package(Threads REQUIRED) include(CheckMmap) @@ -233,6 +251,10 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) + if (GALOIS_ENABLE_WMD) + find_package(MPI REQUIRED) + add_subdirectory(libwmd) + endif() endif() # TODO(loc) prefix with GALOIS @@ -248,6 +270,13 @@ if (GALOIS_ENABLE_GPU) string(REPLACE "." 
"" GENCODE ${GENCODE}) add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") endforeach() +<<<<<<< HEAD +======= + + # This is necessary to allow building for CUDA 11.x (where CUB is bundled) and earlier versions (where CUB is not included) + add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK) + +>>>>>>> 191e9ff91 (Fix 0-initialization of elements in a multiple_sum structure on GPU) add_subdirectory(libgpu) if (USE_DEEPGALOIS) @@ -343,3 +372,11 @@ set(CPACK_PACKAGE_VERSION_MAJOR ${GALOIS_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MINOR ${GALOIS_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_PATCH ${GALOIS_VERSION_PATCH}) include(CPack) + +if(STACK_CAPTURE) + message("Writing CMAKE_CXX_FLAGS") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSTACK_TRACE -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}" CACHE STRING "CMAKE Flags" FORCE) +endif(STACK_CAPTURE) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 007227dc70..36e317c15b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,12 +6,57 @@ tools like `clang-format` manually. Code should be clear and documented where needed. +<<<<<<< HEAD +<<<<<<< HEAD ## Setup Users can run `make docker-image` to setup all dependecies needed for `pando-galois`. After creating the image it can be run via `make docker`. And for first time cmake users can run `make run-cmake`. +======= +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) +======= +# Instrumentation + +This section pertains to enabling and instrumenting memory accesses for +performance projections on the theoretical PANDO hardware. + +In order for the instrumentation code in `libwmd/include/galois/wmd/instrument.h`, +the following should be added to your top level source directory: + +```cmake +set(GALOIS_ENABLE_INSTRUMENT ON) +if (GALOIS_ENABLE_INSTRUMENT) + add_definitions(-DGALOIS_INSTRUMENT) +endif() +``` + +Here is a description of the control-flow macros used by the instrumentation +and when they should be used. 
+ +```cpp +// Should be called once at the start of the program to initialize the instrumentation +// For example specifying `GRAPH_NAME=example-graph` will result in instrumentation +// files starting with `example-graph` +I_INIT(GRAPH_NAME, HOST, NUM_HOSTS, NUM_EDGES) +// Should be called once at the end of the program to cleanup the instrumentation +I_DEINIT() +// Should be called after the first kernel measured if multiple kernels are being measured +// For example if you specified `GRAPH_NAME=example-graph` above then specifying here that +// `NAME_SUFFIX=-kernel2` will result in instrumentation files starting `example-graph-kernel2` +I_NEW_FILE(NAME_SUFFIX, NUM_EDGES) +// I_ROUND should be called at the end of a communication round to log all memory accesses +// and communication recorded into instrumentation files +// I_CLEAR should be called after I_ROUND +I_ROUND(ROUND_NUM) +I_CLEAR() +// Should be called when sending custom communication to a remote host, recommended practice +// is to just pass in the size of the SendBuffer you are using +I_LC(REMOTE_HOST, BYTES) +``` + +>>>>>>> 43672aff5 (chore: Add instrument.h header file to libwmd) ## Tools ### [asdf](https://asdf-vm.com) diff --git a/Makefile b/Makefile index df77923812..d9fc8742ba 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,12 @@ +<<<<<<< HEAD SHELL := /bin/bash IMAGE_NAME := pando-galois VERSION := 0.0.1 CONTAINER_SRC_DIR := /pando-galois +======= +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) dependencies: dependencies-asdf dependencies-asdf: @@ -24,6 +27,7 @@ hooks: pre-commit: @pre-commit run -a +<<<<<<< HEAD docker-image: @docker --context default build --build-arg VERSION=${VERSION} \ @@ -39,3 +43,5 @@ docker: run-cmake: @cmake -S . -B ${BUILD_DIR} -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_MKL_BLAS=ON -DGALOIS_ENABLE_DIST=ON +======= +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) diff --git a/README.md b/README.md index 3375e800ee..ffda74f765 100644 --- a/README.md +++ b/README.md @@ -11,17 +11,17 @@ an implicitly parallel programming model, where the programmer replaces serial l constructs (e.g. for and while) and serial data structures in their algorithms with parallel loop constructs and concurrent data structures provided by Galois to express their algorithms. Galois is designed so that the programmer does not have to deal with low-level parallel programming constructs such as -threads, locks, barriers, condition variables, etc. +threads, locks, barriers, condition variables, etc. Highlights include: - Parallel *for_each* loop that handles dependencies between iterations, as well as dynamic work creation, and a *do_all* loop for simple parallelism. Both provide load balancing and excellent scalability on multi-socket systems - A concurrent graph library designed for graph analytics algorithms as well as - other domains such as irregular meshes. -- Scalable concurrent containers such as bag, vector, list, etc. + other domains such as irregular meshes. +- Scalable concurrent containers such as bag, vector, list, etc. -Galois is released under the BSD-3-Clause license. +Galois is released under the BSD-3-Clause license. Building Galois @@ -45,7 +45,7 @@ Dependencies Galois builds, runs, and has been tested on GNU/Linux. Even though Galois may build on systems similar to Linux, we have not tested correctness or performance, so please -beware. +beware. 
At the minimum, Galois depends on the following software: @@ -55,7 +55,7 @@ At the minimum, Galois depends on the following software: - libllvm (>= 7.0 with RTTI support) - libfmt (>= 4.0) -Here are the dependencies for the optional features: +Here are the dependencies for the optional features: - Linux HUGE_PAGES support (please see [www.kernel.org/doc/Documentation/vm/hugetlbpage.txt](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt)). Performance will most likely degrade without HUGE_PAGES enabled. Galois uses 2MB huge page size and relies on the kernel configuration to set aside a large amount of 2MB pages. For example, our performance testing machine (4x14 cores, 192GB RAM) is configured to support up to 65536 2MB pages: @@ -70,13 +70,14 @@ Here are the dependencies for the optional features: ``` - libnuma support. Performance may degrade without it. Please install - libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. -- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files + libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. +- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files - PAPI (>= 5.2.0.0 ) for profiling sections of code - Vtune (>= 2017 ) for profiling sections of code - MPICH2 (>= 3.2) if you are interested in building and running distributed system applications in Galois -- CUDA (>= 8.0) if you want to build GPU or distributed heterogeneous applications +- CUDA (>= 8.0 and < 11.0) if you want to build GPU or distributed heterogeneous applications. + Note that versions >= 11.0 use an incompatible CUB module and will fail to execute. - Eigen (3.3.1 works for us) for some matrix-completion app variants @@ -148,6 +149,12 @@ ctest in the build directory. +Capturing Stack Information +--------------------------- +Currently if you add `-DSTACK_CAPTURE` to your `cmake` line then you will configure stack capturing. +Please view `libgalois/include/runtime/StackTracer.h` for documentation on functions for printing and reseting. +Do not attempt to modify the capture process otherwise. + Running Galois Applications =========================== @@ -156,9 +163,9 @@ Graph Format ------------ Many Galois/Lonestar applications work with graphs. We store graphs in a binary format -called *galois graph file* +called *galois graph file* (`.gr` file extension). Other formats such as edge-list or Matrix-Market can be -converted to `.gr` format with `graph-convert` tool provided in galois. +converted to `.gr` format with `graph-convert` tool provided in galois. You can build graph-convert as follows: ```Shell @@ -168,20 +175,20 @@ make graph-convert ``` Other applications, such as Delaunay Mesh Refinement may read special file formats -or some may even generate random inputs on the fly. +or some may even generate random inputs on the fly. Running ------- All Lonestar applications take a `-t` command-line option to specify the number of threads to use. All applications run a basic sanity check (often insufficient for -correctness) on the program output, which can be turned off with the `-noverify` option. You -can specify `-help` command-line option to print all available options. +correctness) on the program output, which can be turned off with the `-noverify` option. You +can specify `-help` command-line option to print all available options. 
Upon successful completion, each application will produce some stats regarding running time of various sections, parallel loop iterations and memory usage, etc. These stats are in CSV format and can be redirected to a file using `-statFile` option. -Please refer to the manual for details on stats. +Please refer to the manual for details on stats. Running LonestarGPU applications -------------------------- @@ -199,7 +206,7 @@ Documentation ============= Galois documentation is produced using doxygen, included in this repository, which includes a tutorial, a user's -manual and API documentation for the Galois library. +manual and API documentation for the Galois library. Users can build doxygen documentation in the build directory using: @@ -215,12 +222,12 @@ See online documentation at: Source-Tree Organization ======================== -- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. +- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. - `lonestar` contains the Lonestar benchmark applications and tutorial examples for Galois - `libdist` contains the source code for the distributed-memory and heterogeneous Galois library - `lonestardist` contains the source code for the distributed-memory and heterogeneous benchmark applications. Please refer to `lonestardist/README.md` for instructions on - building and running these apps. + building and running these apps. - `tools` contains various helper programs such as graph-converter to convert between graph file formats and graph-stats to print graph properties diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index 446e9c7dae..d0cc16c354 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -42,6 +42,8 @@ class PartitioningScaffold { uint64_t _numEdges; //!< number of edges in graph //! maps from host id to nodes that host as read from disk std::vector> _gid2host; + std::vector _virtualToPhyMapping; //saving Virtual hosts to Phy hosts map + bool hash; //switch between using gid2host and VtoP maps public: /** @@ -64,6 +66,11 @@ class PartitioningScaffold { */ void saveGIDToHost(std::vector>& gid2host) { _gid2host = gid2host; + hash = false; + } + void saveGIDToHost(std::vector& virtualToPhyMapping) { + _virtualToPhyMapping = virtualToPhyMapping; + hash = true; } bool predeterminedMapping(std::vector&) { return false; } @@ -90,15 +97,19 @@ class ReadMasterAssignment : public PartitioningScaffold { * @returns Host ID of host that read the node specified by the GID. 
*/ uint32_t retrieveMaster(uint32_t gid) const { - for (auto h = 0U; h < _numHosts; ++h) { - uint64_t start, end; - std::tie(start, end) = _gid2host[h]; - if (gid >= start && gid < end) { - return h; + if(hash == false) { + for (auto h = 0U; h < _numHosts; ++h) { + uint64_t start, end; + std::tie(start, end) = _gid2host[h]; + if (gid >= start && gid < end) { + return h; + } } + assert(false); + return _numHosts; + } else { + return _virtualToPhyMapping[gid%(_virtualToPhyMapping.size())]; } - assert(false); - return _numHosts; } // below all unused if not assigning masters in default manner, but must be diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 540b25e120..e4f38d80ea 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -30,8 +30,12 @@ #include #include +<<<<<<< HEAD #include "galois/graphs/LC_CSR_Graph.h" #include "galois/graphs/LC_CSR_CSC_Graph.h" +======= +#include "galois/graphs/LS_LC_CSR_64_Graph.h" +>>>>>>> 3945b1acc (Experimental changes for Distributed Graph) #include "galois/graphs/BufferedGraph.h" #include "galois/runtime/DistStats.h" #include "galois/graphs/OfflineGraph.h" @@ -68,9 +72,13 @@ class DistGraph { //! Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; +<<<<<<< HEAD using GraphTy = galois::graphs::LC_CSR_CSC_Graph; +======= + using GraphTy = galois::graphs::LS_LC_CSR_64_Graph; +>>>>>>> 3945b1acc (Experimental changes for Distributed Graph) // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -896,6 +904,11 @@ class DistGraph { return graph.edge_end(N, galois::MethodFlag::UNPROTECTED); } + /** + * Return the degree of the edge in the local graph + **/ + inline uint64_t localDegree(GraphNode N) { return graph.getDegree(N); } + /** * Returns an iterable object over the edges of a particular node in the * graph. @@ -1081,7 +1094,8 @@ class DistGraph { } else { masterRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, beginMaster, - beginMaster + numOwned, 0); + beginMaster + numOwned, 0, + (galois::graphs::is_LS_LC_CSR_64_Graph::value == 1)); } } @@ -1149,6 +1163,12 @@ class DistGraph { */ void edgesEqualMasters() { specificRanges[2] = specificRanges[1]; } + void recalculateG2LMap() { + for (uint64_t i = 0; i < localToGlobalVector.size(); i++) { + globalToLocalMap[localToGlobalVector[i]] = i; + } + } + public: /** * Write the local LC_CSR graph to the file on a disk. 
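The hashed master assignment added to BasePolicies.h above reduces to a modulo lookup in the virtual-to-physical host table. A small sketch of that lookup in isolation (an illustrative helper, not the actual class method):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative stand-in for retrieveMaster() when the hash-based mapping is active:
// a global node id is mapped to a virtual host by modulo, then to its physical host.
uint32_t MasterHostOf(uint64_t gid, const std::vector<uint32_t>& virtualToPhy) {
  assert(!virtualToPhy.empty());
  return virtualToPhy[gid % virtualToPhy.size()];
}
```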
diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index b02d2c9594..3794d9eef1 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -25,8 +25,6 @@ class NoCommunication : public galois::graphs::ReadMasterAssignment { } }; -/** - */ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { public: MiningPolicyNaive(uint32_t, uint32_t numHosts, uint64_t, uint64_t, @@ -38,6 +36,17 @@ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { bool keepEdge(uint32_t src, uint32_t dst) const { return src < dst; } }; +class OECPolicy : public galois::graphs::ReadMasterAssignment { +public: + OECPolicy(uint32_t, uint32_t numHosts, uint64_t, uint64_t, + std::vector&) + : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {} + + static bool needNodeDegrees() { return false; } + + bool keepEdge(uint32_t, uint32_t) const { return true; } +}; + class MiningPolicyDegrees : public galois::graphs::ReadMasterAssignment { std::vector& ndegrees; diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index e8d7e15d8e..d1ad172080 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -1654,7 +1654,19 @@ class NewDistGraphGeneric : public DistGraph { waitTime.start(); while (hostFinished.count() != base_DistGraph::numHosts || loadsClear.count() != base_DistGraph::numHosts) { +<<<<<<< HEAD // make sure all assignments are done and all loads are done +======= + // #ifndef NDEBUG + // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts to + // finish, ", + // hostFinished.count()); + // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts + // loads " + // "syncs to finish, ", loadsClear.count()); + // #endif + // make sure all assignments are done and all loads are done +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished); asyncRecvLoad(nodeLoads, edgeLoads, loadsClear); diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index a7b83174b7..6832a1afc4 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -28,7 +28,9 @@ #define GALOIS_RUNTIME_SERIALIZE_H #include +#include #include +#include #include #include #include @@ -276,12 +278,22 @@ gSizedObj(const T&, return sizeof(uintptr_t); } +<<<<<<< HEAD //! Size of BufferWrapper is size + number of things in it template inline size_t gSizedObj(const galois::BufferWrapper& data) { return sizeof(size_t) + data.size() * sizeof(T); } +======= +template +inline size_t gSizedObj(const std::unordered_map& data) { + size_t sz = 0; + for (auto i : data) + sz += gSizedObj(i.first) + gSizedObj(i.second); + return sz; +} +>>>>>>> b1a39cdd7 (bug fix) /** * Returns the size necessary for storing 2 elements of a pair into a * serialize buffer. 
@@ -447,6 +459,16 @@ inline void gSerializeObj( * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Data to serialize */ +template +inline void gSerializeObj(SerializeBuffer& buf, + const std::unordered_map& data) { + uint64_t cnt = 0; + for (auto i : data) { + cnt++; + gSerialize(buf, i.first, i.second); + } +} + template inline void gSerializeObj(SerializeBuffer& buf, const T& data, @@ -794,6 +816,17 @@ void gDeserializeObj( data.deserialize(buf); } +template +void gDeserializeObj(DeSerializeBuffer& buf, std::unordered_map& data) { + while (!buf.empty()) { + std::pair i; + gDeserialize(buf, i.first, i.second); + if (buf.getOffset() > buf.size()) { + break; + } + data[i.first] = i.second; + } +} /** * Deserialize a pair from a buffer. * diff --git a/libdist/src/DistStats.cpp b/libdist/src/DistStats.cpp index e8399451f3..1fe46bc514 100644 --- a/libdist/src/DistStats.cpp +++ b/libdist/src/DistStats.cpp @@ -286,13 +286,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper(); getSystemNetworkInterface().flush(); + // work done before check + td += 1; + // barrier while (td.reduce()) { + td.reset(); if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); @@ -302,13 +307,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper2(); getSystemNetworkInterface().flush(); + td += 1; + // barrier while (td.reduce()) { + td.reset(); + if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper2(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); diff --git a/libgalois/include/galois/Atomic.h b/libgalois/include/galois/Atomic.h deleted file mode 100644 index e073bf5aa7..0000000000 --- a/libgalois/include/galois/Atomic.h +++ /dev/null @@ -1,284 +0,0 @@ -/* - * This file belongs to the Galois project, a C++ library for exploiting - * parallelism. The code is being released under the terms of the 3-Clause BSD - * License (a copy is located in LICENSE.txt at the top-level directory). - * - * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. - * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS - * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF - * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF - * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH - * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances - * shall University be liable for incidental, special, indirect, direct or - * consequential damages or loss of profits, interruption of business, or - * related expenses which may arise from use of Software or Documentation, - * including but not limited to those resulting from defects in Software and/or - * Documentation, or loss or inaccuracy of data of any kind. - */ - -#ifndef GALOIS_ATOMIC_H -#define GALOIS_ATOMIC_H - -#include - -#include "galois/config.h" -#include "galois/substrate/CacheLineStorage.h" - -namespace galois { - -namespace internal { -/** - * Common implementation. - */ -template class W, bool CONCURRENT> -class GAtomicImpl { - // galois::runtime::LL::CacheLineStorage val; - W val; - -public: - //! 
Initialize with a value - explicit GAtomicImpl(const T& i) : val(i) {} - //! default constructor - GAtomicImpl() {} - - //! atomic add and fetch - T operator+=(const T& rhs) { return __sync_add_and_fetch(&val.data, rhs); } - //! atomic sub and fetch - T operator-=(const T& rhs) { return __sync_sub_and_fetch(&(val.data), rhs); } - //! atomic increment and fetch - T operator++() { return __sync_add_and_fetch(&(val.data), 1); } - //! atomic fetch and increment - T operator++(int) { return __sync_fetch_and_add(&(val.data), 1); } - //! atomic decrement and fetch - T operator--() { return __sync_sub_and_fetch(&(val.data), 1); } - //! atomic fetch and decrement - T operator--(int) { return __sync_fetch_and_sub(&(val.data), 1); } - //! conversion operator to base data type - operator T() const { return val.data; } - //! assign from underlying type - T& operator=(const T& i) { return val.data = i; } - //! assignment operator - T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; } - //! direct compare and swap - bool cas(const T& expected, const T& updated) { - if (val.data != expected) { - return false; - } -#if defined(__INTEL_COMPILER) - return __sync_bool_compare_and_swap( - &val.data, *reinterpret_cast(&expected), - *reinterpret_cast(&updated)); -#else - return __sync_bool_compare_and_swap(&val.data, expected, updated); -#endif - } -}; - -// non-current version -template class W> -class GAtomicImpl { - // galois::runtime::LL::CacheLineStorage val; - W val; - -public: - //! Initialize with a value - explicit GAtomicImpl(const T& i) : val(i) {} - //! default constructor - GAtomicImpl() {} - - //! atomic add and fetch - T operator+=(const T& rhs) { return (val.data += rhs); } - //! atomic sub and fetch - T operator-=(const T& rhs) { return (val.data -= rhs); } - //! atomic increment and fetch - T operator++() { return ++(val.data); } - //! atomic fetch and increment - T operator++(int) { return (val.data)++; } - //! atomic decrement and fetch - T operator--() { return --(val.data); } - //! atomic fetch and decrement - T operator--(int) { return (val.data)--; } - //! conversion operator to base data type - operator T() const { return val.data; } - //! assign from underlying type - T& operator=(const T& i) { return val.data = i; } - //! assignment operator - T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; } - //! direct compare and swap - bool cas(const T& expected, const T& updated) { - if (val.data != expected) { - return false; - } else { - val.data = updated; - return true; - } - } -}; - -//! Basic atomic -template class W, bool CONCURRENT> -class GAtomicBase : public GAtomicImpl { - typedef GAtomicImpl Super_ty; - -public: - //! Initialize with a value - explicit GAtomicBase(const T& i) : Super_ty(i) {} - - //! default constructor - GAtomicBase() : Super_ty() {} - - T& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); } - - T& operator=(const T& that) { return Super_ty::operator=(that); } -}; - -//! 
Specialization for pointers -template class W, bool CONCURRENT> -class GAtomicBase : public GAtomicImpl { - typedef GAtomicImpl Super_ty; - -public: - typedef typename std::iterator_traits::difference_type difference_type; - - GAtomicBase() : Super_ty() {} - - GAtomicBase(T* i) : Super_ty(i) {} - - T*& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); } - - T*& operator=(T* that) { return Super_ty::operator=(that); } - - T* operator+=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_add_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data += rhs); - } - } - - T* operator-=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_sub_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data -= rhs); - } - } -}; - -//! Specialization for const pointers -template class W, bool CONCURRENT> -class GAtomicBase - : public GAtomicImpl { - typedef GAtomicImpl Super_ty; - -public: - typedef - typename std::iterator_traits::difference_type difference_type; - - GAtomicBase() : Super_ty() {} - - GAtomicBase(const T* i) : Super_ty(i) {} - - const T*& operator=(const GAtomicBase& that) { - return Super_ty::operator=(that); - } - - const T*& operator=(const T* that) { return Super_ty::operator=(that); } - - const T* operator+=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_add_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data += rhs); - } - } - - const T* operator-=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_sub_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data -= rhs); - } - } -}; - -//! Specialization for bools -template